diff options
| author | daoge_cmd <3523206925@qq.com> | 2026-03-01 12:16:08 +0800 |
|---|---|---|
| committer | daoge_cmd <3523206925@qq.com> | 2026-03-01 12:16:08 +0800 |
| commit | b691c43c44ff180d10e7d4a9afc83b98551ff586 (patch) | |
| tree | 3e9849222cbc6ba49f2f1fc6e5fe7179632c7390 /Minecraft.Client/PS3/PS3Extras/DirectX | |
| parent | def8cb415354ac390b7e89052a50605285f1aca9 (diff) | |
Initial commit
Diffstat (limited to 'Minecraft.Client/PS3/PS3Extras/DirectX')
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.h | 339 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.inl | 4801 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXColors.h | 168 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMath.h | 1861 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl | 1962 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMatrix.inl | 3414 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMisc.inl | 2501 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathVector.inl | 10596 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.h | 995 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.inl | 3545 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/no_sal2.h | 1022 | ||||
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/sal.h | 1998 |
12 files changed, 33202 insertions, 0 deletions
diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.h b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.h new file mode 100644 index 00000000..d411432a --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.h @@ -0,0 +1,339 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.h -- C++ Collision Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#include "DirectXMath.h" + +namespace DirectX +{ + +enum ContainmentType +{ + DISJOINT = 0, + INTERSECTS = 1, + CONTAINS = 2, +}; + +enum PlaneIntersectionType +{ + FRONT = 0, + INTERSECTING = 1, + BACK = 2, +}; + +struct BoundingBox; +struct BoundingOrientedBox; +struct BoundingFrustum; + +#pragma warning(push) +#pragma warning(disable:4324 4820) + +//------------------------------------------------------------------------------------- +// Bounding sphere +//------------------------------------------------------------------------------------- +struct BoundingSphere +{ + XMFLOAT3 Center; // Center of the sphere. + float Radius; // Radius of the sphere. 
+ + // Creators + BoundingSphere() : Center(0,0,0), Radius( 1.f ) {} + BoundingSphere( _In_ const XMFLOAT3& center, _In_ float radius ) + : Center(center), Radius(radius) { assert( radius >= 0.f ); }; + BoundingSphere( _In_ const BoundingSphere& sp ) + : Center(sp.Center), Radius(sp.Radius) {} + + // Methods + BoundingSphere& operator=( _In_ const BoundingSphere& sp ) { Center = sp.Center; Radius = sp.Radius; return *this; } + + void Transform( _Out_ BoundingSphere& Out, _In_ CXMMATRIX M ) const; + void Transform( _Out_ BoundingSphere& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + // Transform the sphere + + ContainmentType Contains( _In_ FXMVECTOR Point ) const; + ContainmentType Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-sphere test + + PlaneIntersectionType Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-sphere test + + bool Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-sphere test + + ContainmentType ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ) const; + // Test sphere against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged( _Out_ BoundingSphere& Out, _In_ const BoundingSphere& S1, 
_In_ const BoundingSphere& S2 ); + + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingBox& box ); + static void CreateFromBoundingBox( _Out_ BoundingSphere& Out, _In_ const BoundingOrientedBox& box ); + + static void CreateFromPoints( _Out_ BoundingSphere& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); + + static void CreateFromFrustum( _Out_ BoundingSphere& Out, _In_ const BoundingFrustum& fr ); +}; + +//------------------------------------------------------------------------------------- +// Axis-aligned bounding box +//------------------------------------------------------------------------------------- +struct BoundingBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + + // Creators + BoundingBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ) {} + BoundingBox( _In_ const XMFLOAT3& center, _In_ const XMFLOAT3& extents ) + : Center(center), Extents(extents) { assert(extents.x >= 0 && extents.y >= 0 && extents.z >= 0); } + BoundingBox( _In_ const BoundingBox& box ) : Center(box.Center), Extents(box.Extents) {} + + // Methods + BoundingBox& operator=( _In_ const BoundingBox& box) { Center = box.Center; Extents = box.Extents; return *this; } + + void Transform( _Out_ BoundingBox& Out, _In_ CXMMATRIX M ) const; + void Transform( _Out_ BoundingBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType Contains( _In_ FXMVECTOR Point ) const; + ContainmentType Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const 
BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-Box test + + PlaneIntersectionType Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-box test + + bool Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Box test + + ContainmentType ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ) const; + // Test box against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateMerged( _Out_ BoundingBox& Out, _In_ const BoundingBox& b1, _In_ const BoundingBox& b2 ); + + static void CreateFromSphere( _Out_ BoundingBox& Out, _In_ const BoundingSphere& sh ); + + static void CreateFromPoints( _Out_ BoundingBox& Out, _In_ FXMVECTOR pt1, _In_ FXMVECTOR pt2 ); + static void CreateFromPoints( _Out_ BoundingBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Oriented bounding box +//------------------------------------------------------------------------------------- +struct BoundingOrientedBox +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Center; // Center of the box. + XMFLOAT3 Extents; // Distance from the center to each side. + XMFLOAT4 Orientation; // Unit quaternion representing rotation (box -> world). 
+ + // Creators + BoundingOrientedBox() : Center(0,0,0), Extents( 1.f, 1.f, 1.f ), Orientation(0,0,0, 1.f ) {} + BoundingOrientedBox( _In_ const XMFLOAT3& _Center, _In_ const XMFLOAT3& _Extents, _In_ const XMFLOAT4& _Orientation ) + : Center(_Center), Extents(_Extents), Orientation(_Orientation) + { + assert(_Extents.x >= 0 && _Extents.y >= 0 && _Extents.z >= 0); + } + BoundingOrientedBox( _In_ const BoundingOrientedBox& box ) + : Center(box.Center), Extents(box.Extents), Orientation(box.Orientation) {} + + // Methods + BoundingOrientedBox& operator=( _In_ const BoundingOrientedBox& box ) { Center = box.Center; Extents = box.Extents; Orientation = box.Orientation; return *this; } + + void Transform( _Out_ BoundingOrientedBox& Out, _In_ CXMMATRIX M ) const; + void Transform( _Out_ BoundingOrientedBox& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the box + + ContainmentType Contains( _In_ FXMVECTOR Point ) const; + ContainmentType Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sh ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ const BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-OrientedBox test + + PlaneIntersectionType Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-OrientedBox test + + bool Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-OrientedBox test + + 
ContainmentType ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ) const; + // Test OrientedBox against six planes (see BoundingFrustum::GetPlanes) + + // Static methods + static void CreateFromBoundingBox( _Out_ BoundingOrientedBox& Out, _In_ const BoundingBox& box ); + + static void CreateFromPoints( _Out_ BoundingOrientedBox& Out, _In_ size_t Count, + _In_reads_bytes_(sizeof(XMFLOAT3)+Stride*(Count-1)) const XMFLOAT3* pPoints, _In_ size_t Stride ); +}; + +//------------------------------------------------------------------------------------- +// Bounding frustum +//------------------------------------------------------------------------------------- +struct BoundingFrustum +{ + static const size_t CORNER_COUNT = 8; + + XMFLOAT3 Origin; // Origin of the frustum (and projection). + XMFLOAT4 Orientation; // Quaternion representing rotation. + + float RightSlope; // Positive X slope (X/Z). + float LeftSlope; // Negative X slope. + float TopSlope; // Positive Y slope (Y/Z). + float BottomSlope; // Negative Y slope. + float Near, Far; // Z of the near plane and far plane. 
+ + // Creators + BoundingFrustum() : Origin(0,0,0), Orientation(0,0,0, 1.f), RightSlope( 1.f ), LeftSlope( -1.f ), + TopSlope( 1.f ), BottomSlope( -1.f ), Near(0), Far( 1.f ) {} + BoundingFrustum( _In_ const XMFLOAT3& _Origin, _In_ const XMFLOAT4& _Orientation, + _In_ float _RightSlope, _In_ float _LeftSlope, _In_ float _TopSlope, _In_ float _BottomSlope, + _In_ float _Near, _In_ float _Far ) + : Origin(_Origin), Orientation(_Orientation), + RightSlope(_RightSlope), LeftSlope(_LeftSlope), TopSlope(_TopSlope), BottomSlope(_BottomSlope), + Near(_Near), Far(_Far) { assert( _Near <= _Far ); } + BoundingFrustum( _In_ const BoundingFrustum& fr ) + : Origin(fr.Origin), Orientation(fr.Orientation), RightSlope(fr.RightSlope), LeftSlope(fr.LeftSlope), + TopSlope(fr.TopSlope), BottomSlope(fr.BottomSlope), Near(fr.Near), Far(fr.Far) {} + BoundingFrustum( _In_ CXMMATRIX Projection ) { CreateFromMatrix( *this, Projection ); } + + // Methods + BoundingFrustum& operator=( _In_ const BoundingFrustum& fr ) { Origin=fr.Origin; Orientation=fr.Orientation; + RightSlope=fr.RightSlope; LeftSlope=fr.LeftSlope; + TopSlope=fr.TopSlope; BottomSlope=fr.BottomSlope; + Near=fr.Near; Far=fr.Far; return *this; } + + void Transform( _Out_ BoundingFrustum& Out, _In_ CXMMATRIX M ) const; + void Transform( _Out_ BoundingFrustum& Out, _In_ float Scale, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) const; + + void GetCorners( _Out_writes_(8) XMFLOAT3* Corners ) const; + // Gets the 8 corners of the frustum + + ContainmentType Contains( _In_ FXMVECTOR Point ) const; + ContainmentType Contains( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + ContainmentType Contains( _In_ const BoundingSphere& sp ) const; + ContainmentType Contains( _In_ const BoundingBox& box ) const; + ContainmentType Contains( _In_ const BoundingOrientedBox& box ) const; + ContainmentType Contains( _In_ const BoundingFrustum& fr ) const; + // Frustum-Frustum test + + bool Intersects( _In_ const 
BoundingSphere& sh ) const; + bool Intersects( _In_ const BoundingBox& box ) const; + bool Intersects( _In_ const BoundingOrientedBox& box ) const; + bool Intersects( _In_ const BoundingFrustum& fr ) const; + + bool Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2 ) const; + // Triangle-Frustum test + + PlaneIntersectionType Intersects( _In_ FXMVECTOR Plane ) const; + // Plane-Frustum test + + bool Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _Out_ float& Dist ) const; + // Ray-Frustum test + + ContainmentType ContainedBy( _In_ FXMVECTOR Plane0, _In_ FXMVECTOR Plane1, _In_ FXMVECTOR Plane2, + _In_ GXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ) const; + // Test frustum against six planes (see BoundingFrustum::GetPlanes) + + void GetPlanes( _Out_opt_ XMVECTOR* NearPlane, _Out_opt_ XMVECTOR* FarPlane, _Out_opt_ XMVECTOR* RightPlane, + _Out_opt_ XMVECTOR* LeftPlane, _Out_opt_ XMVECTOR* TopPlane, _Out_opt_ XMVECTOR* BottomPlane ) const; + // Create 6 Planes representation of Frustum + + // Static methods + static void CreateFromMatrix( _Out_ BoundingFrustum& Out, _In_ CXMMATRIX Projection ); +}; + +//----------------------------------------------------------------------------- +// Triangle intersection testing routines. 
+//----------------------------------------------------------------------------- +namespace TriangleTests +{ + bool Intersects( _In_ FXMVECTOR Origin, _In_ FXMVECTOR Direction, _In_ FXMVECTOR V0, _In_ GXMVECTOR V1, _In_ CXMVECTOR V2, _Out_ float& Dist ); + // Ray-Triangle + + bool Intersects( _In_ FXMVECTOR A0, _In_ FXMVECTOR A1, _In_ FXMVECTOR A2, _In_ GXMVECTOR B0, _In_ CXMVECTOR B1, _In_ CXMVECTOR B2 ); + // Triangle-Triangle + + PlaneIntersectionType Intersects( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, _In_ GXMVECTOR Plane ); + // Plane-Triangle + + ContainmentType ContainedBy( _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2, + _In_ GXMVECTOR Plane0, _In_ CXMVECTOR Plane1, _In_ CXMVECTOR Plane2, + _In_ CXMVECTOR Plane3, _In_ CXMVECTOR Plane4, _In_ CXMVECTOR Plane5 ); + // Test a triangle against six planes at once (see BoundingFrustum::GetPlanes) +}; + +#pragma warning(pop) + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable : 4068 4616 6001) + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#include "DirectXCollision.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace DirectX + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.inl b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.inl new file mode 100644 index 00000000..34d44382 --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXCollision.inl @@ -0,0 +1,4801 @@ +//------------------------------------------------------------------------------------- +// DirectXCollision.inl -- C++ Collision Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR 
A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +XMGLOBALCONST XMVECTORF32 g_BoxOffset[8] = +{ + { -1.0f, -1.0f, 1.0f, 0.0f }, + { 1.0f, -1.0f, 1.0f, 0.0f }, + { 1.0f, 1.0f, 1.0f, 0.0f }, + { -1.0f, 1.0f, 1.0f, 0.0f }, + { -1.0f, -1.0f, -1.0f, 0.0f }, + { 1.0f, -1.0f, -1.0f, 0.0f }, + { 1.0f, 1.0f, -1.0f, 0.0f }, + { -1.0f, 1.0f, -1.0f, 0.0f }, +}; + +XMGLOBALCONST XMVECTORF32 g_RayEpsilon = { 1e-20f, 1e-20f, 1e-20f, 1e-20f }; +XMGLOBALCONST XMVECTORF32 g_RayNegEpsilon = { -1e-20f, -1e-20f, -1e-20f, -1e-20f }; +XMGLOBALCONST XMVECTORF32 g_FltMin = { -FLT_MAX, -FLT_MAX, -FLT_MAX, -FLT_MAX }; +XMGLOBALCONST XMVECTORF32 g_FltMax = { FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX }; + +namespace Internal +{ + +//----------------------------------------------------------------------------- +// Return true if any of the elements of a 3 vector are equal to 0xffffffff. +// Slightly more efficient than using XMVector3EqualInt. +//----------------------------------------------------------------------------- +inline bool XMVector3AnyTrue( _In_ FXMVECTOR V ) +{ + // Duplicate the fourth element from the first element. + XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>(V); + + return XMComparisonAnyTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) ); +} + + +//----------------------------------------------------------------------------- +// Return true if all of the elements of a 3 vector are equal to 0xffffffff. +// Slightly more efficient than using XMVector3EqualInt. +//----------------------------------------------------------------------------- +inline bool XMVector3AllTrue( _In_ FXMVECTOR V ) +{ + // Duplicate the fourth element from the first element. 
+ XMVECTOR C = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X>( V ); + + return XMComparisonAllTrue( XMVector4EqualIntR( C, XMVectorTrueInt() ) ); +} + +#if defined(_PREFAST) || !defined(NDEBUG) + +XMGLOBALCONST XMVECTORF32 g_UnitVectorEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; +XMGLOBALCONST XMVECTORF32 g_UnitQuaternionEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; +XMGLOBALCONST XMVECTORF32 g_UnitPlaneEpsilon = { 1.0e-4f, 1.0e-4f, 1.0e-4f, 1.0e-4f }; + +//----------------------------------------------------------------------------- +// Return true if the vector is a unit vector (length == 1). +//----------------------------------------------------------------------------- +inline bool XMVector3IsUnit( _In_ FXMVECTOR V ) +{ + XMVECTOR Difference = XMVector3Length( V ) - XMVectorSplatOne(); + return XMVector4Less( XMVectorAbs( Difference ), g_UnitVectorEpsilon ); +} + +//----------------------------------------------------------------------------- +// Return true if the quaterion is a unit quaternion. +//----------------------------------------------------------------------------- +inline bool XMQuaternionIsUnit( _In_ FXMVECTOR Q ) +{ + XMVECTOR Difference = XMVector4Length( Q ) - XMVectorSplatOne(); + return XMVector4Less( XMVectorAbs( Difference ), g_UnitQuaternionEpsilon ); +} + +//----------------------------------------------------------------------------- +// Return true if the plane is a unit plane. 
+//----------------------------------------------------------------------------- +inline bool XMPlaneIsUnit( _In_ FXMVECTOR Plane ) +{ + XMVECTOR Difference = XMVector3Length( Plane ) - XMVectorSplatOne(); + return XMVector4Less( XMVectorAbs( Difference ), g_UnitPlaneEpsilon ); +} + +#endif // __PREFAST__ || !NDEBUG + +//----------------------------------------------------------------------------- +inline XMVECTOR XMPlaneTransform( _In_ FXMVECTOR Plane, _In_ FXMVECTOR Rotation, _In_ FXMVECTOR Translation ) +{ + XMVECTOR vNormal = XMVector3Rotate( Plane, Rotation ); + XMVECTOR vD = XMVectorSplatW( Plane ) - XMVector3Dot( vNormal, Translation ); + + return XMVectorInsert<0, 0, 0, 0, 1>( vNormal, vD ); +} + +//----------------------------------------------------------------------------- +// Return the point on the line segement (S1, S2) nearest the point P. +//----------------------------------------------------------------------------- +inline XMVECTOR PointOnLineSegmentNearestPoint( _In_ FXMVECTOR S1, _In_ FXMVECTOR S2, _In_ FXMVECTOR P ) +{ + XMVECTOR Dir = S2 - S1; + XMVECTOR Projection = ( XMVector3Dot( P, Dir ) - XMVector3Dot( S1, Dir ) ); + XMVECTOR LengthSq = XMVector3Dot( Dir, Dir ); + + XMVECTOR t = Projection * XMVectorReciprocal( LengthSq ); + XMVECTOR Point = S1 + t * Dir; + + // t < 0 + XMVECTOR SelectS1 = XMVectorLess( Projection, XMVectorZero() ); + Point = XMVectorSelect( Point, S1, SelectS1 ); + + // t > 1 + XMVECTOR SelectS2 = XMVectorGreater( Projection, LengthSq ); + Point = XMVectorSelect( Point, S2, SelectS2 ); + + return Point; +} + +//----------------------------------------------------------------------------- +// Test if the point (P) on the plane of the triangle is inside the triangle +// (V0, V1, V2). +//----------------------------------------------------------------------------- +inline XMVECTOR PointOnPlaneInsideTriangle( _In_ FXMVECTOR P, _In_ FXMVECTOR V0, _In_ FXMVECTOR V1, _In_ GXMVECTOR V2 ) +{ + // Compute the triangle normal. 
+ XMVECTOR N = XMVector3Cross( V2 - V0, V1 - V0 ); + + // Compute the cross products of the vector from the base of each edge to + // the point with each edge vector. + XMVECTOR C0 = XMVector3Cross( P - V0, V1 - V0 ); + XMVECTOR C1 = XMVector3Cross( P - V1, V2 - V1 ); + XMVECTOR C2 = XMVector3Cross( P - V2, V0 - V2 ); + + // If the cross product points in the same direction as the normal the the + // point is inside the edge (it is zero if is on the edge). + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Inside0 = XMVectorGreaterOrEqual( XMVector3Dot( C0, N ), Zero ); + XMVECTOR Inside1 = XMVectorGreaterOrEqual( XMVector3Dot( C1, N ), Zero ); + XMVECTOR Inside2 = XMVectorGreaterOrEqual( XMVector3Dot( C2, N ), Zero ); + + // If the point inside all of the edges it is inside. + return XMVectorAndInt( XMVectorAndInt( Inside0, Inside1 ), Inside2 ); +} + +//----------------------------------------------------------------------------- +inline bool SolveCubic( _In_ float e, _In_ float f, _In_ float g, _Out_ float* t, _Out_ float* u, _Out_ float* v ) +{ + float p, q, h, rc, d, theta, costh3, sinth3; + + p = f - e * e / 3.0f; + q = g - e * f / 3.0f + e * e * e * 2.0f / 27.0f; + h = q * q / 4.0f + p * p * p / 27.0f; + + if( h > 0.0 ) + { + *t = *u = *v = 0.f; + return false; // only one real root + } + + if( ( h == 0.0 ) && ( q == 0.0 ) ) // all the same root + { + *t = - e / 3; + *u = - e / 3; + *v = - e / 3; + + return true; + } + + d = sqrtf( q * q / 4.0f - h ); + if( d < 0 ) + rc = -powf( -d, 1.0f / 3.0f ); + else + rc = powf( d, 1.0f / 3.0f ); + + theta = XMScalarACos( -q / ( 2.0f * d ) ); + costh3 = XMScalarCos( theta / 3.0f ); + sinth3 = sqrtf( 3.0f ) * XMScalarSin( theta / 3.0f ); + *t = 2.0f * rc * costh3 - e / 3.0f; + *u = -rc * ( costh3 + sinth3 ) - e / 3.0f; + *v = -rc * ( costh3 - sinth3 ) - e / 3.0f; + + return true; +} + +//----------------------------------------------------------------------------- +inline XMVECTOR CalculateEigenVector( _In_ float m11, _In_ 
float m12, _In_ float m13, + _In_ float m22, _In_ float m23, _In_ float m33, _In_ float e ) +{ + float fTmp[3]; + fTmp[0] = ( float )( m12 * m23 - m13 * ( m22 - e ) ); + fTmp[1] = ( float )( m13 * m12 - m23 * ( m11 - e ) ); + fTmp[2] = ( float )( ( m11 - e ) * ( m22 - e ) - m12 * m12 ); + + XMVECTOR vTmp = XMLoadFloat3( (XMFLOAT3*)fTmp ); + + if( XMVector3Equal( vTmp, XMVectorZero() ) ) // planar or linear + { + float f1, f2, f3; + + // we only have one equation - find a valid one + if( ( m11 - e != 0.0 ) || ( m12 != 0.0 ) || ( m13 != 0.0 ) ) + { + f1 = m11 - e; f2 = m12; f3 = m13; + } + else if( ( m12 != 0.0 ) || ( m22 - e != 0.0 ) || ( m23 != 0.0 ) ) + { + f1 = m12; f2 = m22 - e; f3 = m23; + } + else if( ( m13 != 0.0 ) || ( m23 != 0.0 ) || ( m33 - e != 0.0 ) ) + { + f1 = m13; f2 = m23; f3 = m33 - e; + } + else + { + // error, we'll just make something up - we have NO context + f1 = 1.0; f2 = 0.0; f3 = 0.0; + } + + if( f1 == 0.0 ) + vTmp = XMVectorSetX( vTmp, 0.0f ); + else + vTmp = XMVectorSetX( vTmp, 1.0f ); + + if( f2 == 0.0 ) + vTmp = XMVectorSetY( vTmp, 0.0f ); + else + vTmp = XMVectorSetY( vTmp, 1.0f ); + + if( f3 == 0.0 ) + { + vTmp = XMVectorSetZ( vTmp, 0.0f ); + // recalculate y to make equation work + if( m12 != 0.0 ) + vTmp = XMVectorSetY( vTmp, ( float )( -f1 / f2 ) ); + } + else + { + vTmp = XMVectorSetZ( vTmp, ( float )( ( f2 - f1 ) / f3 ) ); + } + } + + if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) > 1e-5f ) + { + return XMVector3Normalize( vTmp ); + } + else + { + // Multiply by a value large enough to make the vector non-zero. 
+ vTmp *= 1e5f; + return XMVector3Normalize( vTmp ); + } +} + +//----------------------------------------------------------------------------- +inline bool CalculateEigenVectors( _In_ float m11, _In_ float m12, _In_ float m13, + _In_ float m22, _In_ float m23, _In_ float m33, + _In_ float e1, _In_ float e2, _In_ float e3, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) +{ + *pV1 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e1 ); + *pV2 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e2 ); + *pV3 = DirectX::Internal::CalculateEigenVector( m11, m12, m13, m22, m23, m33, e3 ); + + bool v1z = false; + bool v2z = false; + bool v3z = false; + + XMVECTOR Zero = XMVectorZero(); + + if ( XMVector3Equal( *pV1, Zero ) ) + v1z = true; + + if ( XMVector3Equal( *pV2, Zero ) ) + v2z = true; + + if ( XMVector3Equal( *pV3, Zero )) + v3z = true; + + bool e12 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV2 ) ) ) > 0.1f ); // check for non-orthogonal vectors + bool e13 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV1, *pV3 ) ) ) > 0.1f ); + bool e23 = ( fabsf( XMVectorGetX( XMVector3Dot( *pV2, *pV3 ) ) ) > 0.1f ); + + if( ( v1z && v2z && v3z ) || ( e12 && e13 && e23 ) || + ( e12 && v3z ) || ( e13 && v2z ) || ( e23 && v1z ) ) // all eigenvectors are 0- any basis set + { + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return true; + } + + if( v1z && v2z ) + { + XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV3 ); + if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) + { + vTmp = XMVector3Cross( g_XMIdentityR0, *pV3 ); + } + *pV1 = XMVector3Normalize( vTmp ); + *pV2 = XMVector3Cross( *pV3, *pV1 ); + return true; + } + + if( v3z && v1z ) + { + XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV2 ); + if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) + { + vTmp = XMVector3Cross( g_XMIdentityR0, *pV2 ); + } + *pV3 = XMVector3Normalize( vTmp ); + *pV1 = XMVector3Cross( *pV2, 
*pV3 ); + return true; + } + + if( v2z && v3z ) + { + XMVECTOR vTmp = XMVector3Cross( g_XMIdentityR1, *pV1 ); + if( XMVectorGetX( XMVector3LengthSq( vTmp ) ) < 1e-5f ) + { + vTmp = XMVector3Cross( g_XMIdentityR0, *pV1 ); + } + *pV2 = XMVector3Normalize( vTmp ); + *pV3 = XMVector3Cross( *pV1, *pV2 ); + return true; + } + + if( ( v1z ) || e12 ) + { + *pV1 = XMVector3Cross( *pV2, *pV3 ); + return true; + } + + if( ( v2z ) || e23 ) + { + *pV2 = XMVector3Cross( *pV3, *pV1 ); + return true; + } + + if( ( v3z ) || e13 ) + { + *pV3 = XMVector3Cross( *pV1, *pV2 ); + return true; + } + + return true; +} + +//----------------------------------------------------------------------------- +inline bool CalculateEigenVectorsFromCovarianceMatrix( _In_ float Cxx, _In_ float Cyy, _In_ float Czz, + _In_ float Cxy, _In_ float Cxz, _In_ float Cyz, + _Out_ XMVECTOR* pV1, _Out_ XMVECTOR* pV2, _Out_ XMVECTOR* pV3 ) +{ + // Calculate the eigenvalues by solving a cubic equation. + float e = -( Cxx + Cyy + Czz ); + float f = Cxx * Cyy + Cyy * Czz + Czz * Cxx - Cxy * Cxy - Cxz * Cxz - Cyz * Cyz; + float g = Cxy * Cxy * Czz + Cxz * Cxz * Cyy + Cyz * Cyz * Cxx - Cxy * Cyz * Cxz * 2.0f - Cxx * Cyy * Czz; + + float ev1, ev2, ev3; + if( !DirectX::Internal::SolveCubic( e, f, g, &ev1, &ev2, &ev3 ) ) + { + // set them to arbitrary orthonormal basis set + *pV1 = g_XMIdentityR0.v; + *pV2 = g_XMIdentityR1.v; + *pV3 = g_XMIdentityR2.v; + return false; + } + + return DirectX::Internal::CalculateEigenVectors( Cxx, Cxy, Cxz, Cyy, Cyz, Czz, ev1, ev2, ev3, pV1, pV2, pV3 ); +} + +//----------------------------------------------------------------------------- +inline void FastIntersectTrianglePlane( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane, + XMVECTOR& Outside, XMVECTOR& Inside ) +{ + // Plane0 + XMVECTOR Dist0 = XMVector4Dot( V0, Plane ); + XMVECTOR Dist1 = XMVector4Dot( V1, Plane ); + XMVECTOR Dist2 = XMVector4Dot( V2, Plane ); + + XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 ); + MinDist = 
XMVectorMin( MinDist, Dist2 ); + + XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 ); + MaxDist = XMVectorMax( MaxDist, Dist2 ); + + XMVECTOR Zero = XMVectorZero(); + + // Outside the plane? + Outside = XMVectorGreater( MinDist, Zero ); + + // Fully inside the plane? + Inside = XMVectorLess( MaxDist, Zero ); +} + +//----------------------------------------------------------------------------- +inline void FastIntersectSpherePlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Radius, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) +{ + XMVECTOR Dist = XMVector4Dot( Center, Plane ); + + // Outside the plane? + Outside = XMVectorGreater( Dist, Radius ); + + // Fully inside the plane? + Inside = XMVectorLess( Dist, -Radius ); +} + +//----------------------------------------------------------------------------- +inline void FastIntersectAxisAlignedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Plane, + _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) +{ + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot( Center, Plane ); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. In this case b(i) = [(1,0,0), (0,1,0), (0,0,1)]. + XMVECTOR Radius = XMVector3Dot( Extents, XMVectorAbs( Plane ) ); + + // Outside the plane? + Outside = XMVectorGreater( Dist, Radius ); + + // Fully inside the plane? 
+ Inside = XMVectorLess( Dist, -Radius ); +} + +//----------------------------------------------------------------------------- +inline void FastIntersectOrientedBoxPlane( _In_ FXMVECTOR Center, _In_ FXMVECTOR Extents, _In_ FXMVECTOR Axis0, _In_ GXMVECTOR Axis1, + _In_ CXMVECTOR Axis2, _In_ CXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) +{ + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot( Center, Plane ); + + // Project the axes of the box onto the normal of the plane. Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. + XMVECTOR Radius = XMVector3Dot( Plane, Axis0 ); + Radius = XMVectorInsert<0, 0, 1, 0, 0>( Radius, XMVector3Dot( Plane, Axis1 ) ); + Radius = XMVectorInsert<0, 0, 0, 1, 0>( Radius, XMVector3Dot( Plane, Axis2 ) ); + Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) ); + + // Outside the plane? + Outside = XMVectorGreater( Dist, Radius ); + + // Fully inside the plane? + Inside = XMVectorLess( Dist, -Radius ); +} + +//----------------------------------------------------------------------------- +inline void FastIntersectFrustumPlane( _In_ FXMVECTOR Point0, _In_ FXMVECTOR Point1, _In_ FXMVECTOR Point2, _In_ GXMVECTOR Point3, + _In_ CXMVECTOR Point4, _In_ CXMVECTOR Point5, _In_ CXMVECTOR Point6, _In_ CXMVECTOR Point7, + _In_ CXMVECTOR Plane, _Out_ XMVECTOR& Outside, _Out_ XMVECTOR& Inside ) +{ + // Find the min/max projection of the frustum onto the plane normal. 
+ XMVECTOR Min, Max, Dist; + + Min = Max = XMVector3Dot( Plane, Point0 ); + + Dist = XMVector3Dot( Plane, Point1 ); + Min = XMVectorMin( Min, Dist ); + Max = XMVectorMax( Max, Dist ); + + Dist = XMVector3Dot( Plane, Point2 ); + Min = XMVectorMin( Min, Dist ); + Max = XMVectorMax( Max, Dist ); + + Dist = XMVector3Dot( Plane, Point3 ); + Min = XMVectorMin( Min, Dist ); + Max = XMVectorMax( Max, Dist ); + + Dist = XMVector3Dot( Plane, Point4 ); + Min = XMVectorMin( Min, Dist ); + Max = XMVectorMax( Max, Dist ); + + Dist = XMVector3Dot( Plane, Point5 ); + Min = XMVectorMin( Min, Dist ); + Max = XMVectorMax( Max, Dist ); + + Dist = XMVector3Dot( Plane, Point6 ); + Min = XMVectorMin( Min, Dist ); + Max = XMVectorMax( Max, Dist ); + + Dist = XMVector3Dot( Plane, Point7 ); + Min = XMVectorMin( Min, Dist ); + Max = XMVectorMax( Max, Dist ); + + XMVECTOR PlaneDist = -XMVectorSplatW( Plane ); + + // Outside the plane? + Outside = XMVectorGreater( Min, PlaneDist ); + + // Fully inside the plane? + Inside = XMVectorLess( Max, PlaneDist ); +} + +}; // namespace Internal + + +/**************************************************************************** + * + * BoundingSphere + * + ****************************************************************************/ + +//----------------------------------------------------------------------------- +// Transform a sphere by an angle preserving transform. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::Transform( BoundingSphere& Out, CXMMATRIX M ) const +{ + // Load the center of the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + + // Transform the center of the sphere. 
+ XMVECTOR C = XMVector3Transform( vCenter, M ); + + XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] ); + XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] ); + XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] ); + + XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) ); + + // Store the center sphere. + XMStoreFloat3( &Out.Center, C ); + + // Scale the radius of the pshere. + float Scale = sqrtf( XMVectorGetX(d) ); + Out.Radius = Radius * Scale; +} + +_Use_decl_annotations_ +inline void BoundingSphere::Transform( BoundingSphere& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const +{ + // Load the center of the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + + // Transform the center of the sphere. + vCenter = XMVector3Rotate( vCenter * XMVectorReplicate( Scale ), Rotation ) + Translation; + + // Store the center sphere. + XMStoreFloat3( &Out.Center, vCenter ); + + // Scale the radius of the pshere. + Out.Radius = Radius * Scale; +} + + +//----------------------------------------------------------------------------- +// Point in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + XMVECTOR DistanceSquared = XMVector3LengthSq( Point - vCenter ); + XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); + + return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ) ? 
CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + if ( !Intersects(V0,V1,V2) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSquared = XMVectorMultiply( vRadius, vRadius ); + + XMVECTOR DistanceSquared = XMVector3LengthSq( V0 - vCenter ); + XMVECTOR Inside = XMVectorLessOrEqual(DistanceSquared, RadiusSquared); + + DistanceSquared = XMVector3LengthSq( V1 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); + + DistanceSquared = XMVector3LengthSq( V2 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual(DistanceSquared, RadiusSquared) ); + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR Center1 = XMLoadFloat3( &Center ); + float r1 = Radius; + + XMVECTOR Center2 = XMLoadFloat3( &sh.Center ); + float r2 = sh.Radius; + + XMVECTOR V = XMVectorSubtract( Center2, Center1 ); + + XMVECTOR Dist = XMVector3Length( V ); + + float d = XMVectorGetX( Dist ); + + return (r1 + r2 >= d) ? ((r1 - r2 >= d) ? 
CONTAINS : INTERSECTS) : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingBox& box ) const +{ + if ( !box.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); + XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + XMVECTOR offset = boxCenter - vCenter; + + for( size_t i = 0; i < BoundingBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorMultiplyAdd( boxExtents, g_BoxOffset[i], offset ); + XMVECTOR d = XMVector3LengthSq( C ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? 
CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Oriented box in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingOrientedBox& box ) const +{ + if ( !box.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR boxCenter = XMLoadFloat3( &box.Center ); + XMVECTOR boxExtents = XMLoadFloat3( &box.Extents ); + XMVECTOR boxOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( boxOrientation ) ); + + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( boxExtents * g_BoxOffset[i], boxOrientation ) + boxCenter; + XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; + +} + + +//----------------------------------------------------------------------------- +// Frustum in sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::Contains( const BoundingFrustum& fr ) const +{ + if ( !fr.Intersects(*this) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + XMVECTOR RadiusSq = vRadius * vRadius; + + XMVECTOR vOrigin = XMLoadFloat3( &fr.Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &fr.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Build the corners of the frustum. 
+ XMVECTOR vRightTop = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( fr.LeftSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &fr.Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &fr.Far ); + + XMVECTOR Corners[BoundingFrustum::CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + XMVECTOR InsideAll = XMVectorTrueInt(); + for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVector3Rotate( Corners[i], vOrientation ) + vOrigin; + XMVECTOR d = XMVector3LengthSq( XMVectorSubtract( vCenter, C ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( d, RadiusSq ) ); + } + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( const BoundingSphere& sh ) const +{ + // Load A. + XMVECTOR vCenterA = XMLoadFloat3( &Center ); + XMVECTOR vRadiusA = XMVectorReplicatePtr( &Radius ); + + // Load B. + XMVECTOR vCenterB = XMLoadFloat3( &sh.Center ); + XMVECTOR vRadiusB = XMVectorReplicatePtr( &sh.Radius ); + + // Distance squared between centers. + XMVECTOR Delta = vCenterB - vCenterA; + XMVECTOR DistanceSquared = XMVector3LengthSq( Delta ); + + // Sum of the radii squared. 
+ XMVECTOR RadiusSquared = XMVectorAdd( vRadiusA, vRadiusB ); + RadiusSquared = XMVectorMultiply( RadiusSquared, RadiusSquared ); + + return XMVector3LessOrEqual( DistanceSquared, RadiusSquared ); +} + + +//----------------------------------------------------------------------------- +// Box vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( const BoundingBox& box ) const +{ + return box.Intersects( *this ); +} + +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( const BoundingOrientedBox& box ) const +{ + return box.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Frustum vs. sphere test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( const BoundingFrustum& fr ) const +{ + return fr.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Triangle vs sphere test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Compute the plane of the triangle (has to be normalized). + XMVECTOR N = XMVector3Normalize( XMVector3Cross( V1 - V0, V2 - V0 ) ); + + // Assert that the triangle is not degenerate. + assert( !XMVector3Equal( N, XMVectorZero() ) ); + + // Find the nearest feature on the triangle to the sphere. + XMVECTOR Dist = XMVector3Dot( vCenter - V0, N ); + + // If the center of the sphere is farther from the plane of the triangle than + // the radius of the sphere, then there cannot be an intersection. 
+ XMVECTOR NoIntersection = XMVectorLess( Dist, -vRadius ); + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Dist, vRadius ) ); + + // Project the center of the sphere onto the plane of the triangle. + XMVECTOR Point = vCenter - ( N * Dist ); + + // Is it inside all the edges? If so we intersect because the distance + // to the plane is less than the radius. + XMVECTOR Intersection = DirectX::Internal::PointOnPlaneInsideTriangle( Point, V0, V1, V2 ); + + // Find the nearest point on each edge. + XMVECTOR RadiusSq = vRadius * vRadius; + + // Edge 0,1 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V0, V1, vCenter ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) ); + + // Edge 1,2 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V1, V2, vCenter ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) ); + + // Edge 2,0 + Point = DirectX::Internal::PointOnLineSegmentNearestPoint( V2, V0, vCenter ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. 
+ Intersection = XMVectorOrInt( Intersection, XMVectorLessOrEqual( XMVector3LengthSq( vCenter - Point ), RadiusSq ) ); + + return XMVector4EqualInt( XMVectorAndCInt( Intersection, NoIntersection ), XMVectorTrueInt() ); +} + + +//----------------------------------------------------------------------------- +// Sphere-plane intersection +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType BoundingSphere::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane, Outside, Inside ); + + // If the sphere is outside any plane it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return FRONT; + + // If the sphere is inside all planes it is inside. + if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) ) + return BACK; + + // The sphere is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with a sphere. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingSphere::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const +{ + assert( DirectX::Internal::XMVector3IsUnit( Direction ) ); + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // l is the vector from the ray origin to the center of the sphere. 
+ XMVECTOR l = vCenter - Origin; + + // s is the projection of the l onto the ray direction. + XMVECTOR s = XMVector3Dot( l, Direction ); + + XMVECTOR l2 = XMVector3Dot( l, l ); + + XMVECTOR r2 = vRadius * vRadius; + + // m2 is squared distance from the center of the sphere to the projection. + XMVECTOR m2 = l2 - s * s; + + XMVECTOR NoIntersection; + + // If the ray origin is outside the sphere and the center of the sphere is + // behind the ray origin there is no intersection. + NoIntersection = XMVectorAndInt( XMVectorLess( s, XMVectorZero() ), XMVectorGreater( l2, r2 ) ); + + // If the squared distance from the center of the sphere to the projection + // is greater than the radius squared the ray will miss the sphere. + NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( m2, r2 ) ); + + // The ray hits the sphere, compute the nearest intersection point. + XMVECTOR q = XMVectorSqrt( r2 - m2 ); + XMVECTOR t1 = s - q; + XMVECTOR t2 = s + q; + + XMVECTOR OriginInside = XMVectorLessOrEqual( l2, r2 ); + XMVECTOR t = XMVectorSelect( t1, t2, OriginInside ); + + if( XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ) + { + // Store the x-component to *pDist. + XMStoreFloat( &Dist, t ); + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a sphere vs 6 planes (typically forming a frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingSphere::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 ) const +{ + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &Radius ); + + // Set w of the center to one so we can dot4 with a plane. 
+ vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectSpherePlane( vCenter, vRadius, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the sphere is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the sphere is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The sphere is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Creates a bounding sphere that contains two other bounding spheres +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateMerged( BoundingSphere& Out, const BoundingSphere& S1, const BoundingSphere& S2 ) +{ + XMVECTOR Center1 = XMLoadFloat3( &S1.Center ); + float r1 = S1.Radius; + + XMVECTOR Center2 = XMLoadFloat3( &S2.Center ); + float r2 = S2.Radius; + + XMVECTOR V = XMVectorSubtract( Center2, Center1 ); + + XMVECTOR Dist = XMVector3Length( V ); + + float d = XMVectorGetX(Dist); + + if ( r1 + r2 >= d ) + { + if ( r1 - r2 >= d ) + { + Out = S1; + return; + } + else if ( r2 - r1 >= d ) + { + Out = S2; + return; + } + } + + XMVECTOR N = XMVectorDivide( V, Dist ); + + float t1 = XMMin( -r1, d-r2 ); + float t2 = XMMax( r1, d+r2 ); + float t_5 = (t2 - t1) * 0.5f; + + XMVECTOR NCenter = XMVectorAdd( Center1, XMVectorMultiply( N, XMVectorReplicate( t_5 + t1 ) ) ); + + XMStoreFloat3( &Out.Center, NCenter ); + Out.Radius = t_5; +} + + +//----------------------------------------------------------------------------- +// Create sphere enscribing bounding box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingBox& box ) +{ + Out.Center = box.Center; + XMVECTOR vExtents = XMLoadFloat3( &box.Extents ); + Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) ); +} + +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromBoundingBox( BoundingSphere& Out, const BoundingOrientedBox& box ) +{ + // Bounding box orientation is irrelevant because a sphere is rotationally invariant + Out.Center = box.Center; + XMVECTOR vExtents = XMLoadFloat3( &box.Extents ); + Out.Radius = XMVectorGetX( XMVector3Length( vExtents ) ); +} + + 
+//----------------------------------------------------------------------------- +// Find the approximate smallest enclosing bounding sphere for a set of +// points. Exact computation of the smallest enclosing bounding sphere is +// possible but is slower and requires a more complex algorithm. +// The algorithm is based on Jack Ritter, "An Efficient Bounding Sphere", +// Graphics Gems. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromPoints( BoundingSphere& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride ) +{ + assert( Count > 0 ); + assert( pPoints ); + + // Find the points with minimum and maximum x, y, and z + XMVECTOR MinX, MaxX, MinY, MaxY, MinZ, MaxZ; + + MinX = MaxX = MinY = MaxY = MinZ = MaxZ = XMLoadFloat3( pPoints ); + + for( size_t i = 1; i < Count; ++i ) + { + XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ); + + float px = XMVectorGetX( Point ); + float py = XMVectorGetY( Point ); + float pz = XMVectorGetZ( Point ); + + if( px < XMVectorGetX( MinX ) ) + MinX = Point; + + if( px > XMVectorGetX( MaxX ) ) + MaxX = Point; + + if( py < XMVectorGetY( MinY ) ) + MinY = Point; + + if( py > XMVectorGetY( MaxY ) ) + MaxY = Point; + + if( pz < XMVectorGetZ( MinZ ) ) + MinZ = Point; + + if( pz > XMVectorGetZ( MaxZ ) ) + MaxZ = Point; + } + + // Use the min/max pair that are farthest apart to form the initial sphere. + XMVECTOR DeltaX = MaxX - MinX; + XMVECTOR DistX = XMVector3Length( DeltaX ); + + XMVECTOR DeltaY = MaxY - MinY; + XMVECTOR DistY = XMVector3Length( DeltaY ); + + XMVECTOR DeltaZ = MaxZ - MinZ; + XMVECTOR DistZ = XMVector3Length( DeltaZ ); + + XMVECTOR vCenter; + XMVECTOR vRadius; + + if( XMVector3Greater( DistX, DistY ) ) + { + if( XMVector3Greater( DistX, DistZ ) ) + { + // Use min/max x. 
+ vCenter = XMVectorLerp(MaxX,MinX,0.5f); + vRadius = DistX * 0.5f; + } + else + { + // Use min/max z. + vCenter = XMVectorLerp(MaxZ,MinZ,0.5f); + vRadius = DistZ * 0.5f; + } + } + else // Y >= X + { + if( XMVector3Greater( DistY, DistZ ) ) + { + // Use min/max y. + vCenter = XMVectorLerp(MaxY,MinY,0.5f); + vRadius = DistY * 0.5f; + } + else + { + // Use min/max z. + vCenter = XMVectorLerp(MaxZ,MinZ,0.5f); + vRadius = DistZ * 0.5f; + } + } + + // Add any points not inside the sphere. + for( size_t i = 0; i < Count; ++i ) + { + XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ); + + XMVECTOR Delta = Point - vCenter; + + XMVECTOR Dist = XMVector3Length( Delta ); + + if( XMVector3Greater( Dist, vRadius ) ) + { + // Adjust sphere to include the new point. + vRadius = ( vRadius + Dist ) * 0.5f; + vCenter += ( XMVectorReplicate( 1.0f ) - XMVectorDivide(vRadius,Dist) ) * Delta; + } + } + + XMStoreFloat3( &Out.Center, vCenter ); + XMStoreFloat( &Out.Radius, vRadius ); +} + + +//----------------------------------------------------------------------------- +// Create sphere containing frustum +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingSphere::CreateFromFrustum( BoundingSphere& Out, const BoundingFrustum& fr ) +{ + XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT]; + fr.GetCorners( Corners ); + CreateFromPoints( Out, BoundingFrustum::CORNER_COUNT, Corners, sizeof(XMFLOAT3) ); +} + + +/**************************************************************************** + * + * BoundingBox + * + ****************************************************************************/ + +//----------------------------------------------------------------------------- +// Transform an axis aligned box by an angle preserving transform. 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::Transform( BoundingBox& Out, CXMMATRIX M ) const +{ + // Load center and extents. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); + Corner = XMVector3Transform( Corner, M ); + + XMVECTOR Min, Max; + Min = Max = Corner; + + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + Corner = XMVector3Transform( Corner, M ); + + Min = XMVectorMin( Min, Corner ); + Max = XMVectorMax( Max, Corner ); + } + + // Store center and extents. + XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + +_Use_decl_annotations_ +inline void BoundingBox::Transform( BoundingBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const +{ + assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) ); + + // Load center and extents. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR VectorScale = XMVectorReplicate( Scale ); + + // Compute and transform the corners and find new min/max bounds. + XMVECTOR Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[0], vCenter ); + Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; + + XMVECTOR Min, Max; + Min = Max = Corner; + + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + Corner = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + Corner = XMVector3Rotate( Corner * VectorScale, Rotation ) + Translation; + + Min = XMVectorMin( Min, Corner ); + Max = XMVectorMax( Max, Corner ); + } + + // Store center and extents. 
+ XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f ); + XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f ); +} + + +//----------------------------------------------------------------------------- +// Get the corner points of the box +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingBox::GetCorners( XMFLOAT3* Corners ) const +{ + assert( Corners != nullptr ); + + // Load the box + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + for( size_t i = 0; i < CORNER_COUNT; ++i ) + { + XMVECTOR C = XMVectorMultiplyAdd( vExtents, g_BoxOffset[i], vCenter ); + XMStoreFloat3( &Corners[i], C ); + } +} + + +//----------------------------------------------------------------------------- +// Point in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( FXMVECTOR Point ) const +{ + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + return XMVector3InBounds( Point - vCenter, vExtents ) ? 
CONTAINS : DISJOINT; +} + + +//----------------------------------------------------------------------------- +// Triangle in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + if ( !Intersects(V0,V1,V2) ) + return DISJOINT; + + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + + XMVECTOR d = XMVector3LengthSq( V0 - vCenter ); + XMVECTOR Inside = XMVectorLessOrEqual( d, vExtents ); + + d = XMVector3LengthSq( V1 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + d = XMVector3LengthSq( V2 - vCenter ); + Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) ); + + return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Sphere in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingSphere& sh ) const +{ + XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius ); + + XMVECTOR BoxCenter = XMLoadFloat3( &Center ); + XMVECTOR BoxExtents = XMLoadFloat3( &Extents ); + + XMVECTOR BoxMin = BoxCenter - BoxExtents; + XMVECTOR BoxMax = BoxCenter + BoxExtents; + + // Find the distance to the nearest point on the box. + // for each i in (x, y, z) + // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2 + // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2 + + XMVECTOR d = XMVectorZero(); + + // Compute d for each dimension. 
+ XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin ); + XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax ); + + XMVECTOR MinDelta = SphereCenter - BoxMin; + XMVECTOR MaxDelta = SphereCenter - BoxMax; + + // Choose value for each dimension based on the comparison. + d = XMVectorSelect( d, MinDelta, LessThanMin ); + d = XMVectorSelect( d, MaxDelta, GreaterThanMax ); + + // Use a dot-product to square them and sum them together. + XMVECTOR d2 = XMVector3Dot( d, d ); + + if ( XMVector3Greater( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ) + return DISJOINT; + + XMVECTOR InsideAll = XMVectorLessOrEqual( BoxMin + SphereRadius, SphereCenter ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( SphereCenter, BoxMax - SphereRadius ) ); + InsideAll = XMVectorAndInt( InsideAll, XMVectorGreater( BoxMax - BoxMin, SphereRadius ) ); + + return ( XMVector3EqualInt( InsideAll, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Axis-aligned box in axis-aligned box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingBox::Contains( const BoundingBox& box ) const +{ + XMVECTOR CenterA = XMLoadFloat3( &Center ); + XMVECTOR ExtentsA = XMLoadFloat3( &Extents ); + + XMVECTOR CenterB = XMLoadFloat3( &box.Center ); + XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents ); + + XMVECTOR MinA = CenterA - ExtentsA; + XMVECTOR MaxA = CenterA + ExtentsA; + + XMVECTOR MinB = CenterB - ExtentsB; + XMVECTOR MaxB = CenterB + ExtentsB; + + // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false + XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) ); + + if ( DirectX::Internal::XMVector3AnyTrue( Disjoint ) ) + return DISJOINT; + + // for each i in (x, y, z) if a_min(i) <= b_min(i) and b_max(i) <= a_max(i) then A 
contains B
+    XMVECTOR Inside = XMVectorAndInt( XMVectorLessOrEqual( MinA, MinB ), XMVectorLessOrEqual( MaxB, MaxA ) );
+
+    return DirectX::Internal::XMVector3AllTrue( Inside ) ? CONTAINS : INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Oriented box in axis-aligned box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingBox::Contains( const BoundingOrientedBox& box ) const
+{
+    if ( !box.Intersects( *this ) )
+        return DISJOINT;
+
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+    // Subtract off the AABB center to remove a subtract below
+    XMVECTOR oCenter = XMLoadFloat3( &box.Center ) - vCenter;
+
+    XMVECTOR oExtents = XMLoadFloat3( &box.Extents );
+    XMVECTOR oOrientation = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( oOrientation ) );
+
+    XMVECTOR Inside = XMVectorTrueInt();
+
+    for( size_t i=0; i < BoundingOrientedBox::CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVector3Rotate( oExtents * g_BoxOffset[i], oOrientation ) + oCenter;
+        // Per-axis |offset| of the OBB corner from the AABB center, compared
+        // against the AABB extents.  (The previous code used
+        // XMVector3LengthSq( C ), which compares a replicated *squared*
+        // distance against the unsquared extents and reports wrong
+        // containment; upstream DirectXMath fixed this to XMVectorAbs.)
+        XMVECTOR d = XMVectorAbs( C );
+        Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) );
+    }
+
+    // CONTAINS only if every corner was inside on every axis.
+    return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum in axis-aligned box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingBox::Contains( const BoundingFrustum& fr ) const
+{
+    if ( !fr.Intersects( *this ) )
+        return DISJOINT;
+
+    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
+    fr.GetCorners( Corners );
+
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+    XMVECTOR Inside = XMVectorTrueInt();
+
+    for( size_t i=0; i < BoundingFrustum::CORNER_COUNT; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( &Corners[i] );
+        // Per-axis |offset| of the frustum corner from the box center,
+        // compared against the extents (same squared-vs-unsquared fix as the
+        // oriented-box overload above: XMVector3LengthSq was incorrect here).
+        XMVECTOR d = XMVectorAbs( Point - vCenter );
+        Inside = XMVectorAndInt( Inside, XMVectorLessOrEqual( d, vExtents ) );
+    }
+
+    return ( XMVector3EqualInt( Inside, XMVectorTrueInt() ) ) ? CONTAINS : INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere vs axis-aligned box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingBox::Intersects( const BoundingSphere& sh ) const
+{
+    XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center );
+    XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius );
+
+    XMVECTOR BoxCenter = XMLoadFloat3( &Center );
+    XMVECTOR BoxExtents = XMLoadFloat3( &Extents );
+
+    XMVECTOR BoxMin = BoxCenter - BoxExtents;
+    XMVECTOR BoxMax = BoxCenter + BoxExtents;
+
+    // Find the distance to the nearest point on the box.
+    // for each i in (x, y, z)
+    // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2
+    // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2
+
+    XMVECTOR d = XMVectorZero();
+
+    // Compute d for each dimension. 
+    XMVECTOR LessThanMin = XMVectorLess( SphereCenter, BoxMin );
+    XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxMax );
+
+    XMVECTOR MinDelta = SphereCenter - BoxMin;
+    XMVECTOR MaxDelta = SphereCenter - BoxMax;
+
+    // Choose value for each dimension based on the comparison.
+    d = XMVectorSelect( d, MinDelta, LessThanMin );
+    d = XMVectorSelect( d, MaxDelta, GreaterThanMax );
+
+    // Use a dot-product to square them and sum them together.
+    XMVECTOR d2 = XMVector3Dot( d, d );
+
+    // Intersecting when the nearest point on the box is within one radius.
+    return XMVector3LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) );
+}
+
+
+//-----------------------------------------------------------------------------
+// Axis-aligned box vs. axis-aligned box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingBox::Intersects( const BoundingBox& box ) const
+{
+    XMVECTOR CenterA = XMLoadFloat3( &Center );
+    XMVECTOR ExtentsA = XMLoadFloat3( &Extents );
+
+    XMVECTOR CenterB = XMLoadFloat3( &box.Center );
+    XMVECTOR ExtentsB = XMLoadFloat3( &box.Extents );
+
+    XMVECTOR MinA = CenterA - ExtentsA;
+    XMVECTOR MaxA = CenterA + ExtentsA;
+
+    XMVECTOR MinB = CenterB - ExtentsB;
+    XMVECTOR MaxB = CenterB + ExtentsB;
+
+    // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then return false
+    XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( MinA, MaxB ), XMVectorGreater( MinB, MaxA ) );
+
+    return !DirectX::Internal::XMVector3AnyTrue( Disjoint );
+}
+
+
+//-----------------------------------------------------------------------------
+// Oriented box vs. axis-aligned box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingBox::Intersects( const BoundingOrientedBox& box ) const
+{
+    // Defer to the OBB implementation (roles swapped) so the SAT test lives
+    // in one place.
+    return box.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum vs. axis-aligned box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingBox::Intersects( const BoundingFrustum& fr ) const
+{
+    // Defer to the frustum implementation (roles swapped).
+    return fr.Intersects( *this );
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle vs. axis aligned box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+    XMVECTOR Zero = XMVectorZero();
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+    XMVECTOR BoxMin = vCenter - vExtents;
+    XMVECTOR BoxMax = vCenter + vExtents;
+
+    // Test the axes of the box (in effect test the AAB against the minimal AAB
+    // around the triangle).
+    XMVECTOR TriMin = XMVectorMin( XMVectorMin( V0, V1 ), V2 );
+    XMVECTOR TriMax = XMVectorMax( XMVectorMax( V0, V1 ), V2 );
+
+    // for each i in (x, y, z) if a_min(i) > b_max(i) or b_min(i) > a_max(i) then disjoint
+    XMVECTOR Disjoint = XMVectorOrInt( XMVectorGreater( TriMin, BoxMax ), XMVectorGreater( BoxMin, TriMax ) );
+    if( DirectX::Internal::XMVector3AnyTrue( Disjoint ) )
+        return false;
+
+    // Test the plane of the triangle.
+    XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 );
+    XMVECTOR Dist = XMVector3Dot( Normal, V0 );
+
+    // Assert that the triangle is not degenerate. 
+    assert( !XMVector3Equal( Normal, Zero ) );
+
+    // for each i in (x, y, z) if n(i) >= 0 then v_min(i)=b_min(i), v_max(i)=b_max(i)
+    // else v_min(i)=b_max(i), v_max(i)=b_min(i)
+    XMVECTOR NormalSelect = XMVectorGreater( Normal, Zero );
+    XMVECTOR V_Min = XMVectorSelect( BoxMax, BoxMin, NormalSelect );
+    XMVECTOR V_Max = XMVectorSelect( BoxMin, BoxMax, NormalSelect );
+
+    // if n dot v_min + d > 0 || n dot v_max + d < 0 then disjoint
+    XMVECTOR MinDist = XMVector3Dot( V_Min, Normal );
+    XMVECTOR MaxDist = XMVector3Dot( V_Max, Normal );
+
+    XMVECTOR NoIntersection = XMVectorGreater( MinDist, Dist );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( MaxDist, Dist ) );
+
+    // Move the box center to zero to simplify the following tests.
+    XMVECTOR TV0 = V0 - vCenter;
+    XMVECTOR TV1 = V1 - vCenter;
+    XMVECTOR TV2 = V2 - vCenter;
+
+    // Test the edge/edge axes (3*3).
+    XMVECTOR e0 = TV1 - TV0;
+    XMVECTOR e1 = TV2 - TV1;
+    XMVECTOR e2 = TV0 - TV2;
+
+    // Make w zero.
+    e0 = XMVectorInsert<0, 0, 0, 0, 1>( e0, Zero );
+    e1 = XMVectorInsert<0, 0, 0, 0, 1>( e1, Zero );
+    e2 = XMVectorInsert<0, 0, 0, 0, 1>( e2, Zero );
+
+    XMVECTOR Axis;
+    XMVECTOR p0, p1, p2;
+    XMVECTOR Min, Max;
+    XMVECTOR Radius;
+
+    // Axis == (1,0,0) x e0 = (0, -e0.z, e0.y)
+    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e0, -e0 );
+    p0 = XMVector3Dot( TV0, Axis );
+    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+    p2 = XMVector3Dot( TV2, Axis );
+    Min = XMVectorMin( p0, p2 );
+    Max = XMVectorMax( p0, p2 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (1,0,0) x e1 = (0, -e1.z, e1.y)
+    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e1, -e1 );
+    p0 = XMVector3Dot( TV0, Axis );
+    p1 = XMVector3Dot( TV1, Axis );
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+    Min = XMVectorMin( p0, p1 );
+    Max = XMVectorMax( p0, p1 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (1,0,0) x e2 = (0, -e2.z, e2.y)
+    Axis = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( e2, -e2 );
+    p0 = XMVector3Dot( TV0, Axis );
+    p1 = XMVector3Dot( TV1, Axis );
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+    Min = XMVectorMin( p0, p1 );
+    Max = XMVectorMax( p0, p1 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (0,1,0) x e0 = (e0.z, 0, -e0.x)
+    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e0, -e0 );
+    p0 = XMVector3Dot( TV0, Axis );
+    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+    p2 = XMVector3Dot( TV2, Axis );
+    Min = XMVectorMin( p0, p2 );
+    Max = XMVectorMax( p0, p2 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (0,1,0) x e1 = (e1.z, 0, -e1.x)
+    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e1, -e1 );
+    p0 = XMVector3Dot( TV0, Axis );
+    p1 = XMVector3Dot( TV1, Axis );
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+    Min = XMVectorMin( p0, p1 );
+    Max = XMVectorMax( p0, p1 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (0,1,0) x e2 = (e2.z, 0, -e2.x)
+    Axis = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( e2, -e2 );
+    p0 = XMVector3Dot( TV0, Axis );
+    p1 = XMVector3Dot( TV1, Axis );
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+    Min = XMVectorMin( p0, p1 );
+    Max = XMVectorMax( p0, p1 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (0,0,1) x e0 = (-e0.y, e0.x, 0)
+    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e0, -e0 );
+    p0 = XMVector3Dot( TV0, Axis );
+    // p1 = XMVector3Dot( V1, Axis ); // p1 = p0;
+    p2 = XMVector3Dot( TV2, Axis );
+    Min = XMVectorMin( p0, p2 );
+    Max = XMVectorMax( p0, p2 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (0,0,1) x e1 = (-e1.y, e1.x, 0)
+    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e1, -e1 );
+    p0 = XMVector3Dot( TV0, Axis );
+    p1 = XMVector3Dot( TV1, Axis );
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p1;
+    Min = XMVectorMin( p0, p1 );
+    Max = XMVectorMax( p0, p1 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    // Axis == (0,0,1) x e2 = (-e2.y, e2.x, 0)
+    Axis = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( e2, -e2 );
+    p0 = XMVector3Dot( TV0, Axis );
+    p1 = XMVector3Dot( TV1, Axis );
+    // p2 = XMVector3Dot( V2, Axis ); // p2 = p0;
+    Min = XMVectorMin( p0, p1 );
+    Max = XMVectorMax( p0, p1 );
+    Radius = XMVector3Dot( vExtents, XMVectorAbs( Axis ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( Min, Radius ) );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( Max, -Radius ) );
+
+    return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() );
+}
+
+
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType BoundingBox::Intersects( FXMVECTOR Plane ) const
+{
+    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane, Outside, Inside );
+
+    // If the box is outside any plane it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the box is inside all planes it is inside.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The box is not inside all planes or outside a plane it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Compute the intersection of a ray (Origin, Direction) with an axis aligned
+// box using the slabs method.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
+{
+    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+    // Adjust ray origin to be relative to center of the box.
+    XMVECTOR TOrigin = vCenter - Origin;
+
+    // Compute the dot product against each axis of the box. 
+    // Since the axes are (1,0,0), (0,1,0), (0,0,1) no computation is necessary.
+    XMVECTOR AxisDotOrigin = TOrigin;
+    XMVECTOR AxisDotDirection = Direction;
+
+    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
+    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );
+
+    // Test against all three axes simultaneously.
+    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
+    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
+    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;
+
+    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
+    // use the results from any directions parallel to the slab.
+    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
+    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );
+
+    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
+    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
+    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
+    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
+    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
+    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)
+
+    // if ( t_min > t_max ) return false;
+    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );
+
+    // if ( t_max < 0.0f ) return false;
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );
+
+    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
+    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
+    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );
+
+    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
+    {
+        // Store the x-component to *pDist
+        XMStoreFloat( &Dist, t_min );
+        return true;
+    }
+
+    Dist = 0.f;
+    return false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test an axis aligned box vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
+                                                 GXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 ) const
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+
+    // Set w of the center to one so we can dot4 with a plane.
+    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );
+
+    XMVECTOR Outside, Inside;
+
+    // Test against each plane.
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane0, Outside, Inside );
+
+    XMVECTOR AnyOutside = Outside;
+    XMVECTOR AllInside = Inside;
+
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane1, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane2, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane3, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane4, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = XMVectorAndInt( AllInside, Inside );
+
+    DirectX::Internal::FastIntersectAxisAlignedBoxPlane( vCenter, vExtents, Plane5, Outside, Inside );
+    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
+    AllInside = 
XMVectorAndInt( AllInside, Inside );
+
+    // If the box is outside any plane it is outside.
+    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
+        return DISJOINT;
+
+    // If the box is inside all planes it is inside.
+    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
+        return CONTAINS;
+
+    // The box is not inside all planes or outside a plane, it may intersect.
+    return INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Create axis-aligned box that contains two other bounding boxes
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateMerged( BoundingBox& Out, const BoundingBox& b1, const BoundingBox& b2 )
+{
+    XMVECTOR b1Center = XMLoadFloat3( &b1.Center );
+    XMVECTOR b1Extents = XMLoadFloat3( &b1.Extents );
+
+    XMVECTOR b2Center = XMLoadFloat3( &b2.Center );
+    XMVECTOR b2Extents = XMLoadFloat3( &b2.Extents );
+
+    XMVECTOR Min = XMVectorSubtract( b1Center, b1Extents );
+    Min = XMVectorMin( Min, XMVectorSubtract( b2Center, b2Extents ) );
+
+    XMVECTOR Max = XMVectorAdd( b1Center, b1Extents );
+    Max = XMVectorMax( Max, XMVectorAdd( b2Center, b2Extents ) );
+
+    assert( XMVector3LessOrEqual( Min, Max ) );
+
+    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
+}
+
+
+//-----------------------------------------------------------------------------
+// Create axis-aligned box that contains a bounding sphere
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromSphere( BoundingBox& Out, const BoundingSphere& sh )
+{
+    XMVECTOR spCenter = XMLoadFloat3( &sh.Center );
+    XMVECTOR shRadius = XMVectorReplicatePtr( &sh.Radius );
+
+    XMVECTOR Min = XMVectorSubtract( spCenter, shRadius );
+    XMVECTOR Max = XMVectorAdd( spCenter, shRadius );
+
+    assert( XMVector3LessOrEqual( Min, Max ) );
+
+    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
+}
+
+
+//-----------------------------------------------------------------------------
+// Create axis-aligned box from min/max points
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromPoints( BoundingBox& Out, FXMVECTOR pt1, FXMVECTOR pt2 )
+{
+    XMVECTOR Min = XMVectorMin( pt1, pt2 );
+    XMVECTOR Max = XMVectorMax( pt1, pt2 );
+
+    // Store center and extents.
+    XMStoreFloat3( &Out.Center, ( Min + Max ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( Max - Min ) * 0.5f );
+}
+
+
+//-----------------------------------------------------------------------------
+// Find the minimum axis aligned bounding box containing a set of points.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingBox::CreateFromPoints( BoundingBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
+{
+    assert( Count > 0 );
+    assert( pPoints );
+
+    // Find the minimum and maximum x, y, and z
+    XMVECTOR vMin, vMax;
+
+    vMin = vMax = XMLoadFloat3( pPoints );
+
+    // Stride is in bytes: the point pointer is advanced i * Stride bytes.
+    for( size_t i = 1; i < Count; ++i )
+    {
+        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );
+
+        vMin = XMVectorMin( vMin, Point );
+        vMax = XMVectorMax( vMax, Point );
+    }
+
+    // Store center and extents.
+    XMStoreFloat3( &Out.Center, ( vMin + vMax ) * 0.5f );
+    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
+}
+
+
+/****************************************************************************
+ *
+ * BoundingOrientedBox
+ *
+ ****************************************************************************/
+
+//-----------------------------------------------------------------------------
+// Transform an oriented box by an angle preserving transform. 
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::Transform( BoundingOrientedBox& Out, CXMMATRIX M ) const
+{
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Composite the box rotation and the transform rotation.
+    XMVECTOR Rotation = XMQuaternionRotationMatrix( M );
+    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );
+
+    // Transform the center.
+    vCenter = XMVector3Transform( vCenter, M );
+
+    // Scale the box extents by the length of each basis row of M.
+    XMVECTOR dX = XMVector3Length( M.r[0] );
+    XMVECTOR dY = XMVector3Length( M.r[1] );
+    XMVECTOR dZ = XMVector3Length( M.r[2] );
+
+    // Build the per-axis scale ( dX, dY, dZ, * ).  XMVectorSelect takes
+    // components from its SECOND operand where the control bit is set, so
+    // operand order matters: the previous code passed both calls with the
+    // operands swapped, producing ( dZ, dZ, dX ) and mis-scaling the extents
+    // whenever M carries a non-uniform scale.
+    XMVECTOR VectorScale = XMVectorSelect( dY, dX, g_XMSelect1000 );
+    VectorScale = XMVectorSelect( dZ, VectorScale, g_XMSelect1100 );
+    vExtents = vExtents * VectorScale;
+
+    // Store the box.
+    XMStoreFloat3( &Out.Center, vCenter );
+    XMStoreFloat3( &Out.Extents, vExtents );
+    XMStoreFloat4( &Out.Orientation, vOrientation );
+}
+
+_Use_decl_annotations_
+inline void BoundingOrientedBox::Transform( BoundingOrientedBox& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
+{
+    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );
+
+    // Load the box.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    // Composite the box rotation and the transform rotation.
+    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );
+
+    // Transform the center.
+    XMVECTOR VectorScale = XMVectorReplicate( Scale );
+    vCenter = XMVector3Rotate( vCenter * VectorScale, Rotation ) + Translation;
+
+    // Scale the box extents. 
+    vExtents = vExtents * VectorScale;
+
+    // Store the box.
+    XMStoreFloat3( &Out.Center, vCenter );
+    XMStoreFloat3( &Out.Extents, vExtents );
+    XMStoreFloat4( &Out.Orientation, vOrientation );
+}
+
+
+//-----------------------------------------------------------------------------
+// Get the corner points of the box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void BoundingOrientedBox::GetCorners( XMFLOAT3* Corners ) const
+{
+    assert( Corners != 0 );
+
+    // Load the box
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    for( size_t i = 0; i < CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVector3Rotate( vExtents * g_BoxOffset[i], vOrientation ) + vCenter;
+        XMStoreFloat3( &Corners[i], C );
+    }
+}
+
+
+//-----------------------------------------------------------------------------
+// Point in oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingOrientedBox::Contains( FXMVECTOR Point ) const
+{
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    // Transform the point to be local to the box.
+    XMVECTOR TPoint = XMVector3InverseRotate( Point - vCenter, vOrientation );
+
+    return XMVector3InBounds( TPoint, vExtents ) ? CONTAINS : DISJOINT;
+}
+
+
+//-----------------------------------------------------------------------------
+// Triangle in oriented bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingOrientedBox::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
+{
+    // Load the box center & orientation.
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );
+
+    // Transform the triangle vertices into the space of the box.
+    XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation );
+    XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation );
+    XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation );
+
+    // In the box's local frame the OBB is an origin-centered AABB with the
+    // same extents.
+    BoundingBox box;
+    box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f );
+    box.Extents = Extents;
+
+    // Use the triangle vs axis aligned box intersection routine.
+    return box.Contains( TV0, TV1, TV2 );
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere in oriented bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingOrientedBox::Contains( const BoundingSphere& sh ) const
+{
+    XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center );
+    XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius );
+
+    XMVECTOR BoxCenter = XMLoadFloat3( &Center );
+    XMVECTOR BoxExtents = XMLoadFloat3( &Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Transform the center of the sphere to be local to the box.
+    // BoxMin = -BoxExtents
+    // BoxMax = +BoxExtents
+    SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation );
+
+    // Find the distance to the nearest point on the box.
+    // for each i in (x, y, z)
+    // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2
+    // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2
+
+    XMVECTOR d = XMVectorZero();
+
+    // Compute d for each dimension.
+    XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents );
+    XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents );
+
+    XMVECTOR MinDelta = SphereCenter + BoxExtents;
+    XMVECTOR MaxDelta = SphereCenter - BoxExtents;
+
+    // Choose value for each dimension based on the comparison.
+    d = XMVectorSelect( d, MinDelta, LessThanMin );
+    d = XMVectorSelect( d, MaxDelta, GreaterThanMax );
+
+    // Use a dot-product to square them and sum them together.
+    XMVECTOR d2 = XMVector3Dot( d, d );
+    XMVECTOR SphereRadiusSq = XMVectorMultiply( SphereRadius, SphereRadius );
+
+    if ( XMVector4Greater( d2, SphereRadiusSq ) )
+        return DISJOINT;
+
+    // See if we are completely inside the box
+    XMVECTOR SMin = SphereCenter - SphereRadius;
+    XMVECTOR SMax = SphereCenter + SphereRadius;
+
+    return ( XMVector3InBounds( SMin, BoxExtents ) && XMVector3InBounds( SMax, BoxExtents ) ) ? CONTAINS : INTERSECTS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Axis aligned box vs. oriented box. Constructs an oriented box and uses
+// the oriented box vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingOrientedBox::Contains( const BoundingBox& box ) const
+{
+    // Make the axis aligned box oriented and do an OBB vs OBB test.
+    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
+    return Contains( obox );
+}
+
+
+//-----------------------------------------------------------------------------
+// Oriented bounding box in oriented bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingOrientedBox::Contains( const BoundingOrientedBox& box ) const
+{
+    if ( !Intersects(box) )
+        return DISJOINT;
+
+    // Load the boxes
+    XMVECTOR aCenter = XMLoadFloat3( &Center );
+    XMVECTOR aExtents = XMLoadFloat3( &Extents );
+    XMVECTOR aOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( aOrientation ) );
+
+    XMVECTOR bCenter = XMLoadFloat3( &box.Center );
+    XMVECTOR bExtents = XMLoadFloat3( &box.Extents );
+    XMVECTOR bOrientation = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( bOrientation ) );
+
+    XMVECTOR offset = bCenter - aCenter;
+
+    for( size_t i = 0; i < CORNER_COUNT; ++i )
+    {
+        // Cb = rotate( bExtents * corneroffset[i], bOrientation ) + bcenter
+        // Ca = invrotate( Cb - aCenter, aOrientation )
+
+        XMVECTOR C = XMVector3Rotate( bExtents * g_BoxOffset[i], bOrientation ) + offset;
+        C = XMVector3InverseRotate( C , aOrientation );
+
+        if ( !XMVector3InBounds( C, aExtents ) )
+            return INTERSECTS;
+    }
+
+    return CONTAINS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Frustum in oriented bounding box
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType BoundingOrientedBox::Contains( const BoundingFrustum& fr ) const
+{
+    if ( !fr.Intersects(*this) )
+        return DISJOINT;
+
+    XMFLOAT3 Corners[BoundingFrustum::CORNER_COUNT];
+    fr.GetCorners( Corners );
+
+    // Load the box
+    XMVECTOR vCenter = XMLoadFloat3( &Center );
+    XMVECTOR vExtents = XMLoadFloat3( &Extents );
+    XMVECTOR 
vOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );
+
+    for( size_t i = 0; i < BoundingFrustum::CORNER_COUNT; ++i )
+    {
+        XMVECTOR C = XMVector3InverseRotate( XMLoadFloat3( &Corners[i] ) - vCenter, vOrientation );
+
+        if ( !XMVector3InBounds( C, vExtents ) )
+            return INTERSECTS;
+    }
+
+    return CONTAINS;
+}
+
+
+//-----------------------------------------------------------------------------
+// Sphere vs. oriented box test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingSphere& sh ) const
+{
+    XMVECTOR SphereCenter = XMLoadFloat3( &sh.Center );
+    XMVECTOR SphereRadius = XMVectorReplicatePtr( &sh.Radius );
+
+    XMVECTOR BoxCenter = XMLoadFloat3( &Center );
+    XMVECTOR BoxExtents = XMLoadFloat3( &Extents );
+    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );
+
+    // Transform the center of the sphere to be local to the box.
+    // BoxMin = -BoxExtents
+    // BoxMax = +BoxExtents
+    SphereCenter = XMVector3InverseRotate( SphereCenter - BoxCenter, BoxOrientation );
+
+    // Find the distance to the nearest point on the box.
+    // for each i in (x, y, z)
+    // if (SphereCenter(i) < BoxMin(i)) d2 += (SphereCenter(i) - BoxMin(i)) ^ 2
+    // else if (SphereCenter(i) > BoxMax(i)) d2 += (SphereCenter(i) - BoxMax(i)) ^ 2
+
+    XMVECTOR d = XMVectorZero();
+
+    // Compute d for each dimension.
+    XMVECTOR LessThanMin = XMVectorLess( SphereCenter, -BoxExtents );
+    XMVECTOR GreaterThanMax = XMVectorGreater( SphereCenter, BoxExtents );
+
+    XMVECTOR MinDelta = SphereCenter + BoxExtents;
+    XMVECTOR MaxDelta = SphereCenter - BoxExtents;
+
+    // Choose value for each dimension based on the comparison.
+    d = XMVectorSelect( d, MinDelta, LessThanMin );
+    d = XMVectorSelect( d, MaxDelta, GreaterThanMax );
+
+    // Use a dot-product to square them and sum them together.
+    XMVECTOR d2 = XMVector3Dot( d, d );
+
+    return XMVector4LessOrEqual( d2, XMVectorMultiply( SphereRadius, SphereRadius ) ) ? true : false;
+}
+
+
+//-----------------------------------------------------------------------------
+// Axis aligned box vs. oriented box. Constructs an oriented box and uses
+// the oriented box vs. oriented box test.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingBox& box ) const
+{
+    // Make the axis aligned box oriented and do an OBB vs OBB test.
+    BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) );
+    return Intersects( obox );
+}
+
+
+//-----------------------------------------------------------------------------
+// Fast oriented box / oriented box intersection test using the separating axis
+// theorem.
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingOrientedBox::Intersects( const BoundingOrientedBox& box ) const
+{
+    // Build the 3x3 rotation matrix that defines the orientation of B relative to A.
+    XMVECTOR A_quat = XMLoadFloat4( &Orientation );
+    XMVECTOR B_quat = XMLoadFloat4( &box.Orientation );
+
+    assert( DirectX::Internal::XMQuaternionIsUnit( A_quat ) );
+    assert( DirectX::Internal::XMQuaternionIsUnit( B_quat ) );
+
+    XMVECTOR Q = XMQuaternionMultiply( A_quat, XMQuaternionConjugate( B_quat ) );
+    XMMATRIX R = XMMatrixRotationQuaternion( Q );
+
+    // Compute the translation of B relative to A.
+    XMVECTOR A_cent = XMLoadFloat3( &Center );
+    XMVECTOR B_cent = XMLoadFloat3( &box.Center );
+    XMVECTOR t = XMVector3InverseRotate( B_cent - A_cent, A_quat );
+
+    //
+    // h(A) = extents of A.
+    // h(B) = extents of B.
+    //
+    // a(u) = axes of A = (1,0,0), (0,1,0), (0,0,1)
+    // b(u) = axes of B relative to A = (r00,r10,r20), (r01,r11,r21), (r02,r12,r22)
+    //
+    // For each possible separating axis l:
+    //   d(A) = sum (for i = u,v,w) h(A)(i) * abs( a(i) dot l )
+    //   d(B) = sum (for i = u,v,w) h(B)(i) * abs( b(i) dot l )
+    //   if abs( t dot l ) > d(A) + d(B) then disjoint
+    //
+
+    // Load extents of A and B.
+    XMVECTOR h_A = XMLoadFloat3( &Extents );
+    XMVECTOR h_B = XMLoadFloat3( &box.Extents );
+
+    // Rows. Note R[0,1,2]X.w = 0.
+    XMVECTOR R0X = R.r[0];
+    XMVECTOR R1X = R.r[1];
+    XMVECTOR R2X = R.r[2];
+
+    R = XMMatrixTranspose( R );
+
+    // Columns. Note RX[0,1,2].w = 0.
+    XMVECTOR RX0 = R.r[0];
+    XMVECTOR RX1 = R.r[1];
+    XMVECTOR RX2 = R.r[2];
+
+    // Absolute value of rows.
+    XMVECTOR AR0X = XMVectorAbs( R0X );
+    XMVECTOR AR1X = XMVectorAbs( R1X );
+    XMVECTOR AR2X = XMVectorAbs( R2X );
+
+    // Absolute value of columns.
+    XMVECTOR ARX0 = XMVectorAbs( RX0 );
+    XMVECTOR ARX1 = XMVectorAbs( RX1 );
+    XMVECTOR ARX2 = XMVectorAbs( RX2 );
+
+    // Test each of the 15 possible separating axes. 
+ XMVECTOR d, d_A, d_B; + + // l = a(u) = (1, 0, 0) + // t dot l = t.x + // d(A) = h(A).x + // d(B) = h(B) dot abs(r00, r01, r02) + d = XMVectorSplatX( t ); + d_A = XMVectorSplatX( h_A ); + d_B = XMVector3Dot( h_B, AR0X ); + XMVECTOR NoIntersection = XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ); + + // l = a(v) = (0, 1, 0) + // t dot l = t.y + // d(A) = h(A).y + // d(B) = h(B) dot abs(r10, r11, r12) + d = XMVectorSplatY( t ); + d_A = XMVectorSplatY( h_A ); + d_B = XMVector3Dot( h_B, AR1X ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) = (0, 0, 1) + // t dot l = t.z + // d(A) = h(A).z + // d(B) = h(B) dot abs(r20, r21, r22) + d = XMVectorSplatZ( t ); + d_A = XMVectorSplatZ( h_A ); + d_B = XMVector3Dot( h_B, AR2X ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = b(u) = (r00, r10, r20) + // d(A) = h(A) dot abs(r00, r10, r20) + // d(B) = h(B).x + d = XMVector3Dot( t, RX0 ); + d_A = XMVector3Dot( h_A, ARX0 ); + d_B = XMVectorSplatX( h_B ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = b(v) = (r01, r11, r21) + // d(A) = h(A) dot abs(r01, r11, r21) + // d(B) = h(B).y + d = XMVector3Dot( t, RX1 ); + d_A = XMVector3Dot( h_A, ARX1 ); + d_B = XMVectorSplatY( h_B ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = b(w) = (r02, r12, r22) + // d(A) = h(A) dot abs(r02, r12, r22) + // d(B) = h(B).z + d = XMVector3Dot( t, RX2 ); + d_A = XMVector3Dot( h_A, ARX2 ); + d_B = XMVectorSplatZ( h_B ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(u) x b(u) = (0, -r20, r10) + // d(A) = h(A) dot abs(0, r20, r10) + // d(B) = h(B) dot abs(0, r02, r01) + d = XMVector3Dot( t, 
XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX0, -RX0 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX0 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR0X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(u) x b(v) = (0, -r21, r11) + // d(A) = h(A) dot abs(0, r21, r11) + // d(B) = h(B) dot abs(r02, 0, r00) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX1, -RX1 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX1 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR0X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(u) x b(w) = (0, -r22, r12) + // d(A) = h(A) dot abs(0, r22, r12) + // d(B) = h(B) dot abs(r01, r00, 0) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_0X>( RX2, -RX2 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( ARX2 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR0X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(v) x b(u) = (r20, 0, -r00) + // d(A) = h(A) dot abs(r20, 0, r00) + // d(B) = h(B) dot abs(0, r12, r11) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX0, -RX0 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX0 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( 
AR1X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(v) x b(v) = (r21, 0, -r01) + // d(A) = h(A) dot abs(r21, 0, r01) + // d(B) = h(B) dot abs(r12, 0, r10) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX1, -RX1 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX1 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR1X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(v) x b(w) = (r22, 0, -r02) + // d(A) = h(A) dot abs(r22, 0, r02) + // d(B) = h(B) dot abs(r11, r10, 0) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0W, XM_PERMUTE_1X, XM_PERMUTE_0Y>( RX2, -RX2 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( ARX2 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR1X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) x b(u) = (-r10, r00, 0) + // d(A) = h(A) dot abs(r10, r00, 0) + // d(B) = h(B) dot abs(0, r22, r21) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX0, -RX0 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX0 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_X>( AR2X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) x b(v) = (-r11, r01, 0) + // d(A) = h(A) dot abs(r11, r01, 0) + // d(B) = h(B) dot abs(r22, 0, r20) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, 
XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX1, -RX1 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX1 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Y>( AR2X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // l = a(w) x b(w) = (-r12, r02, 0) + // d(A) = h(A) dot abs(r12, r02, 0) + // d(B) = h(B) dot abs(r21, r20, 0) + d = XMVector3Dot( t, XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_0Z>( RX2, -RX2 ) ); + d_A = XMVector3Dot( h_A, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( ARX2 ) ); + d_B = XMVector3Dot( h_B, XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_Z>( AR2X ) ); + NoIntersection = XMVectorOrInt( NoIntersection, + XMVectorGreater( XMVectorAbs(d), XMVectorAdd( d_A, d_B ) ) ); + + // No seperating axis found, boxes must intersect. + return XMVector4NotEqualInt( NoIntersection, XMVectorTrueInt() ) ? true : false; +} + + +//----------------------------------------------------------------------------- +// Frustum vs. oriented box test +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( const BoundingFrustum& fr ) const +{ + return fr.Intersects( *this ); +} + + +//----------------------------------------------------------------------------- +// Triangle vs. oriented box test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingOrientedBox::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Load the box center & orientation. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Transform the triangle vertices into the space of the box. 
+ XMVECTOR TV0 = XMVector3InverseRotate( V0 - vCenter, vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( V1 - vCenter, vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( V2 - vCenter, vOrientation ); + + BoundingBox box; + box.Center = XMFLOAT3( 0.0f, 0.0f, 0.0f ); + box.Extents = Extents; + + // Use the triangle vs axis aligned box intersection routine. + return box.Intersects( TV0, TV1, TV2 ); +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType BoundingOrientedBox::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load the box. + XMVECTOR vCenter = XMLoadFloat3( &Center ); + XMVECTOR vExtents = XMLoadFloat3( &Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Set w of the center to one so we can dot4 with a plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation ); + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane, Outside, Inside ); + + // If the box is outside any plane it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return FRONT; + + // If the box is inside all planes it is inside. + if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) ) + return BACK; + + // The box is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with an oriented box +// using the slabs method. 
//-----------------------------------------------------------------------------
// Direction must be a unit vector. On a hit, returns true and sets Dist to the
// parametric distance along the ray to the first intersection; on a miss,
// returns false and sets Dist to 0.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline bool BoundingOrientedBox::Intersects( FXMVECTOR Origin, FXMVECTOR Direction, float& Dist ) const
{
    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );

    static const XMVECTORI32 SelectY =
    {
        XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0
    };
    static const XMVECTORI32 SelectZ =
    {
        XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0
    };

    // Load the box.
    XMVECTOR vCenter = XMLoadFloat3( &Center );
    XMVECTOR vExtents = XMLoadFloat3( &Extents );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );

    // Get the boxes normalized side directions.
    XMMATRIX R = XMMatrixRotationQuaternion( vOrientation );

    // Adjust ray origin to be relative to center of the box.
    XMVECTOR TOrigin = vCenter - Origin;

    // Compute the dot product against each axis of the box,
    // packing the three per-axis results into the x/y/z lanes of one vector.
    XMVECTOR AxisDotOrigin = XMVector3Dot( R.r[0], TOrigin );
    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[1], TOrigin ), SelectY );
    AxisDotOrigin = XMVectorSelect( AxisDotOrigin, XMVector3Dot( R.r[2], TOrigin ), SelectZ );

    XMVECTOR AxisDotDirection = XMVector3Dot( R.r[0], Direction );
    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[1], Direction ), SelectY );
    AxisDotDirection = XMVectorSelect( AxisDotDirection, XMVector3Dot( R.r[2], Direction ), SelectZ );

    // if (fabs(AxisDotDirection) <= Epsilon) the ray is nearly parallel to the slab.
    XMVECTOR IsParallel = XMVectorLessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon );

    // Test against all three axes simultaneously.
    XMVECTOR InverseAxisDotDirection = XMVectorReciprocal( AxisDotDirection );
    XMVECTOR t1 = ( AxisDotOrigin - vExtents ) * InverseAxisDotDirection;
    XMVECTOR t2 = ( AxisDotOrigin + vExtents ) * InverseAxisDotDirection;

    // Compute the max of min(t1,t2) and the min of max(t1,t2) ensuring we don't
    // use the results from any directions parallel to the slab.
    XMVECTOR t_min = XMVectorSelect( XMVectorMin( t1, t2 ), g_FltMin, IsParallel );
    XMVECTOR t_max = XMVectorSelect( XMVectorMax( t1, t2 ), g_FltMax, IsParallel );

    // t_min.x = maximum( t_min.x, t_min.y, t_min.z );
    // t_max.x = minimum( t_max.x, t_max.y, t_max.z );
    t_min = XMVectorMax( t_min, XMVectorSplatY( t_min ) );  // x = max(x,y)
    t_min = XMVectorMax( t_min, XMVectorSplatZ( t_min ) );  // x = max(max(x,y),z)
    t_max = XMVectorMin( t_max, XMVectorSplatY( t_max ) );  // x = min(x,y)
    t_max = XMVectorMin( t_max, XMVectorSplatZ( t_max ) );  // x = min(min(x,y),z)

    // if ( t_min > t_max ) return false;
    XMVECTOR NoIntersection = XMVectorGreater( XMVectorSplatX( t_min ), XMVectorSplatX( t_max ) );

    // if ( t_max < 0.0f ) return false;
    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( XMVectorSplatX( t_max ), XMVectorZero() ) );

    // if (IsParallel && (-Extents > AxisDotOrigin || Extents < AxisDotOrigin)) return false;
    XMVECTOR ParallelOverlap = XMVectorInBounds( AxisDotOrigin, vExtents );
    NoIntersection = XMVectorOrInt( NoIntersection, XMVectorAndCInt( IsParallel, ParallelOverlap ) );

    if( !DirectX::Internal::XMVector3AnyTrue( NoIntersection ) )
    {
        // Store the x-component to *pDist
        XMStoreFloat( &Dist, t_min );
        return true;
    }

    Dist = 0.f;
    return false;
}


//-----------------------------------------------------------------------------
// Test an oriented box vs 6 planes (typically forming a frustum).
//-----------------------------------------------------------------------------
// All six planes must be normalized. Returns DISJOINT if the box is fully
// outside any plane, CONTAINS if fully inside all planes, else INTERSECTS.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline ContainmentType BoundingOrientedBox::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2,
                                                         GXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 ) const
{
    // Load the box.
    XMVECTOR vCenter = XMLoadFloat3( &Center );
    XMVECTOR vExtents = XMLoadFloat3( &Extents );
    XMVECTOR BoxOrientation = XMLoadFloat4( &Orientation );

    assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) );

    // Set w of the center to one so we can dot4 with a plane.
    vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() );

    // Build the 3x3 rotation matrix that defines the box axes.
    XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation );

    XMVECTOR Outside, Inside;

    // Test against each plane, accumulating "outside any" and "inside all" masks.
    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane0, Outside, Inside );

    XMVECTOR AnyOutside = Outside;
    XMVECTOR AllInside = Inside;

    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane1, Outside, Inside );
    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
    AllInside = XMVectorAndInt( AllInside, Inside );

    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane2, Outside, Inside );
    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
    AllInside = XMVectorAndInt( AllInside, Inside );

    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane3, Outside, Inside );
    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
    AllInside = XMVectorAndInt( AllInside, Inside );

    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane4, Outside, Inside );
    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
    AllInside = XMVectorAndInt( AllInside, Inside );

    DirectX::Internal::FastIntersectOrientedBoxPlane( vCenter, vExtents, R.r[0], R.r[1], R.r[2], Plane5, Outside, Inside );
    AnyOutside = XMVectorOrInt( AnyOutside, Outside );
    AllInside = XMVectorAndInt( AllInside, Inside );

    // If the box is outside any plane it is outside.
    if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) )
        return DISJOINT;

    // If the box is inside all planes it is inside.
    if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) )
        return CONTAINS;

    // The box is not inside all planes or outside a plane, it may intersect.
    return INTERSECTS;
}


//-----------------------------------------------------------------------------
// Create oriented bounding box from axis-aligned bounding box
// (identity orientation quaternion; center and extents copied unchanged).
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline void BoundingOrientedBox::CreateFromBoundingBox( BoundingOrientedBox& Out, const BoundingBox& box )
{
    Out.Center = box.Center;
    Out.Extents = box.Extents;
    Out.Orientation = XMFLOAT4( 0.f, 0.f, 0.f, 1.f );
}


//-----------------------------------------------------------------------------
// Find the approximate minimum oriented bounding box containing a set of
// points. Exact computation of minimum oriented bounding box is possible but
// is slower and requires a more complex algorithm.
// The algorithm works by computing the inertia tensor of the points and then
// using the eigenvectors of the inertia tensor as the axes of the box.
// Computing the inertia tensor of the convex hull of the points will usually
// result in better bounding box but the computation is more complex.
// Exact computation of the minimum oriented bounding box is possible but the
// best known algorithm is O(N^3) and is significantly more complex to implement.
//-----------------------------------------------------------------------------
// Count must be > 0; pPoints must not be null. Stride is the byte distance
// between consecutive points, allowing the XMFLOAT3s to be embedded in a
// larger vertex structure.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline void BoundingOrientedBox::CreateFromPoints( BoundingOrientedBox& Out, size_t Count, const XMFLOAT3* pPoints, size_t Stride )
{
    assert( Count > 0 );
    assert( pPoints != 0 );

    XMVECTOR CenterOfMass = XMVectorZero();

    // Compute the center of mass and inertia tensor of the points.
    for( size_t i = 0; i < Count; ++i )
    {
        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) );

        CenterOfMass += Point;
    }

    CenterOfMass *= XMVectorReciprocal( XMVectorReplicate( float( Count ) ) );

    // Compute the inertia tensor of the points around the center of mass.
    // Using the center of mass is not strictly necessary, but will hopefully
    // improve the stability of finding the eigenvectors.
    XMVECTOR XX_YY_ZZ = XMVectorZero();
    XMVECTOR XY_XZ_YZ = XMVectorZero();

    for( size_t i = 0; i < Count; ++i )
    {
        XMVECTOR Point = XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ) - CenterOfMass;

        // Diagonal terms (x*x, y*y, z*z) accumulate in one vector...
        XX_YY_ZZ += Point * Point;

        // ...and the off-diagonal terms (x*y, x*z, y*z) in another.
        XMVECTOR XXY = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>( Point );
        XMVECTOR YZZ = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_W>( Point );

        XY_XZ_YZ += XXY * YZZ;
    }

    XMVECTOR v1, v2, v3;

    // Compute the eigenvectors of the inertia tensor.
    DirectX::Internal::CalculateEigenVectorsFromCovarianceMatrix( XMVectorGetX( XX_YY_ZZ ), XMVectorGetY( XX_YY_ZZ ),
                                                                  XMVectorGetZ( XX_YY_ZZ ),
                                                                  XMVectorGetX( XY_XZ_YZ ), XMVectorGetY( XY_XZ_YZ ),
                                                                  XMVectorGetZ( XY_XZ_YZ ),
                                                                  &v1, &v2, &v3 );

    // Put them in a matrix.
    XMMATRIX R;

    R.r[0] = XMVectorSetW( v1, 0.f );
    R.r[1] = XMVectorSetW( v2, 0.f );
    R.r[2] = XMVectorSetW( v3, 0.f );
    R.r[3] = g_XMIdentityR3.v;

    // Multiply by -1 to convert the matrix into a right handed coordinate
    // system (Det ~= 1) in case the eigenvectors form a left handed
    // coordinate system (Det ~= -1) because XMQuaternionRotationMatrix only
    // works on right handed matrices.
    XMVECTOR Det = XMMatrixDeterminant( R );

    if( XMVector4Less( Det, XMVectorZero() ) )
    {
        R.r[0] *= g_XMNegativeOne.v;
        R.r[1] *= g_XMNegativeOne.v;
        R.r[2] *= g_XMNegativeOne.v;
    }

    // Get the rotation quaternion from the matrix.
    XMVECTOR vOrientation = XMQuaternionRotationMatrix( R );

    // Make sure it is normal (in case the vectors are slightly non-orthogonal).
    vOrientation = XMQuaternionNormalize( vOrientation );

    // Rebuild the rotation matrix from the quaternion.
    R = XMMatrixRotationQuaternion( vOrientation );

    // Build the rotation into the rotated space.
    XMMATRIX InverseR = XMMatrixTranspose( R );

    // Find the minimum OBB using the eigenvectors as the axes.
    XMVECTOR vMin, vMax;

    vMin = vMax = XMVector3TransformNormal( XMLoadFloat3( pPoints ), InverseR );

    for( size_t i = 1; i < Count; ++i )
    {
        XMVECTOR Point = XMVector3TransformNormal( XMLoadFloat3( reinterpret_cast<const XMFLOAT3*>( reinterpret_cast<const uint8_t*>(pPoints) + i * Stride ) ),
                                                   InverseR );

        vMin = XMVectorMin( vMin, Point );
        vMax = XMVectorMax( vMax, Point );
    }

    // Rotate the center into world space.
    XMVECTOR vCenter = ( vMin + vMax ) * 0.5f;
    vCenter = XMVector3TransformNormal( vCenter, R );

    // Store center, extents, and orientation.
    XMStoreFloat3( &Out.Center, vCenter );
    XMStoreFloat3( &Out.Extents, ( vMax - vMin ) * 0.5f );
    XMStoreFloat4( &Out.Orientation, vOrientation );
}


/****************************************************************************
 *
 * BoundingFrustum
 *
 ****************************************************************************/

//-----------------------------------------------------------------------------
// Transform a frustum by an angle preserving transform.
// M must be an affine transform composed of rotation, translation, and
// (uniform-equivalent) scale; the largest axis scale is applied to Near/Far.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline void BoundingFrustum::Transform( BoundingFrustum& Out, CXMMATRIX M ) const
{
    // Load the frustum.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );

    // Composite the frustum rotation and the transform rotation.
    XMVECTOR Rotation = XMQuaternionRotationMatrix( M );
    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );

    // Transform the center.
    vOrigin = XMVector3Transform( vOrigin, M );

    // Store the frustum.
    XMStoreFloat3( &Out.Origin, vOrigin );
    XMStoreFloat4( &Out.Orientation, vOrientation );

    // Scale the near and far distances (the slopes remain the same).
    XMVECTOR dX = XMVector3Dot( M.r[0], M.r[0] );
    XMVECTOR dY = XMVector3Dot( M.r[1], M.r[1] );
    XMVECTOR dZ = XMVector3Dot( M.r[2], M.r[2] );

    XMVECTOR d = XMVectorMax( dX, XMVectorMax( dY, dZ ) );
    float Scale = sqrtf( XMVectorGetX(d) );

    Out.Near = Near * Scale;
    Out.Far = Far * Scale;

    // Copy the slopes.
    Out.RightSlope = RightSlope;
    Out.LeftSlope = LeftSlope;
    Out.TopSlope = TopSlope;
    Out.BottomSlope = BottomSlope;
}

//-----------------------------------------------------------------------------
// Transform a frustum by a uniform scale, a unit-quaternion rotation, and a
// translation (applied in that order).
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline void BoundingFrustum::Transform( BoundingFrustum& Out, float Scale, FXMVECTOR Rotation, FXMVECTOR Translation ) const
{
    assert( DirectX::Internal::XMQuaternionIsUnit( Rotation ) );

    // Load the frustum.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );

    // Composite the frustum rotation and the transform rotation.
    vOrientation = XMQuaternionMultiply( vOrientation, Rotation );

    // Transform the origin.
    vOrigin = XMVector3Rotate( vOrigin * XMVectorReplicate( Scale ), Rotation ) + Translation;

    // Store the frustum.
    XMStoreFloat3( &Out.Origin, vOrigin );
    XMStoreFloat4( &Out.Orientation, vOrientation );

    // Scale the near and far distances (the slopes remain the same).
    Out.Near = Near * Scale;
    Out.Far = Far * Scale;

    // Copy the slopes.
    Out.RightSlope = RightSlope;
    Out.LeftSlope = LeftSlope;
    Out.TopSlope = TopSlope;
    Out.BottomSlope = BottomSlope;
}


//-----------------------------------------------------------------------------
// Get the corner points of the frustum
// Corners must point to an array of at least CORNER_COUNT (8) XMFLOAT3s.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline void BoundingFrustum::GetCorners( XMFLOAT3* Corners ) const
{
    assert( Corners != 0 );

    // Load origin and orientation of the frustum.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );

    // Build the corners of the frustum (ray directions at z = 1).
    XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f );
    XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f );
    XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f );
    XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f );
    XMVECTOR vNear = XMVectorReplicatePtr( &Near );
    XMVECTOR vFar = XMVectorReplicatePtr( &Far );

    // Returns the 8 corner positions of the bounding frustum.
    //     Near    Far
    //    0----1  4----5
    //    |    |  |    |
    //    |    |  |    |
    //    3----2  7----6

    XMVECTOR vCorners[CORNER_COUNT];
    vCorners[0] = vLeftTop * vNear;
    vCorners[1] = vRightTop * vNear;
    vCorners[2] = vRightBottom * vNear;
    vCorners[3] = vLeftBottom * vNear;
    vCorners[4] = vLeftTop * vFar;
    vCorners[5] = vRightTop * vFar;
    vCorners[6] = vRightBottom * vFar;
    vCorners[7] = vLeftBottom * vFar;

    // Rotate/translate the local-space corners into world space.
    for( size_t i=0; i < CORNER_COUNT; ++i )
    {
        XMVECTOR C = XMVector3Rotate( vCorners[i], vOrientation ) + vOrigin;
        XMStoreFloat3( &Corners[i], C );
    }
}


//-----------------------------------------------------------------------------
// Point in frustum test.
// Returns CONTAINS or DISJOINT (a point is never INTERSECTS).
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline ContainmentType BoundingFrustum::Contains( FXMVECTOR Point ) const
{
    // Build frustum planes (in the frustum's local space).
    XMVECTOR Planes[6];
    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );

    // Load origin and orientation.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) );

    // Transform point into local space of frustum.
    XMVECTOR TPoint = XMVector3InverseRotate( Point - vOrigin, vOrientation );

    // Set w to one.
    TPoint = XMVectorInsert<0, 0, 0, 0, 1>( TPoint, XMVectorSplatOne() );

    XMVECTOR Zero = XMVectorZero();
    XMVECTOR Outside = Zero;

    // Test point against each plane of the frustum.
    for( size_t i = 0; i < 6; ++i )
    {
        XMVECTOR Dot = XMVector4Dot( TPoint, Planes[i] );
        Outside = XMVectorOrInt( Outside, XMVectorGreater( Dot, Zero ) );
    }

    return XMVector4NotEqualInt( Outside, XMVectorTrueInt() ) ? CONTAINS : DISJOINT;
}


//-----------------------------------------------------------------------------
// Triangle vs frustum test.
// Builds the six world-space frustum planes and defers to the triangle
// ContainedBy test.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline ContainmentType BoundingFrustum::Contains( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const
{
    // Load origin and orientation of the frustum.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    // Create 6 planes (do it inline to encourage use of registers)
    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
    NearPlane = XMPlaneNormalize( NearPlane );

    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
    FarPlane = XMPlaneNormalize( FarPlane );

    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
    RightPlane = XMPlaneNormalize( RightPlane );

    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
    LeftPlane = XMPlaneNormalize( LeftPlane );

    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
    TopPlane = XMPlaneNormalize( TopPlane );

    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
    BottomPlane = XMPlaneNormalize( BottomPlane );

    return TriangleTests::ContainedBy( V0, V1, V2, NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
}


//-----------------------------------------------------------------------------
// Sphere vs frustum containment: builds the six world-space frustum planes and
// defers to the sphere's ContainedBy test.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline ContainmentType BoundingFrustum::Contains( const BoundingSphere& sh ) const
{
    // Load origin and orientation of the frustum.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    // Create 6 planes (do it inline to encourage use of registers)
    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
    NearPlane = XMPlaneNormalize( NearPlane );

    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
    FarPlane = XMPlaneNormalize( FarPlane );

    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
    RightPlane = XMPlaneNormalize( RightPlane );

    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
    LeftPlane = XMPlaneNormalize( LeftPlane );

    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
    TopPlane = XMPlaneNormalize( TopPlane );

    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
    BottomPlane = XMPlaneNormalize( BottomPlane );

    return sh.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
}


//-----------------------------------------------------------------------------
// Axis-aligned box vs frustum containment: builds the six world-space frustum
// planes and defers to the box's ContainedBy test.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline ContainmentType BoundingFrustum::Contains( const BoundingBox& box ) const
{
    // Load origin and orientation of the frustum.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    // Create 6 planes (do it inline to encourage use of registers)
    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
    NearPlane = XMPlaneNormalize( NearPlane );

    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
    FarPlane = XMPlaneNormalize( FarPlane );

    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
    RightPlane = XMPlaneNormalize( RightPlane );

    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
    LeftPlane = XMPlaneNormalize( LeftPlane );

    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
    TopPlane = XMPlaneNormalize( TopPlane );

    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
    BottomPlane = XMPlaneNormalize( BottomPlane );

    return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
}


//-----------------------------------------------------------------------------
// Oriented box vs frustum containment: builds the six world-space frustum
// planes and defers to the oriented box's ContainedBy test.
//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline ContainmentType BoundingFrustum::Contains( const BoundingOrientedBox& box ) const
{
    // Load origin and orientation of the frustum.
    XMVECTOR vOrigin = XMLoadFloat3( &Origin );
    XMVECTOR vOrientation = XMLoadFloat4( &Orientation );

    // Create 6 planes (do it inline to encourage use of registers)
    XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
    NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin );
    NearPlane = XMPlaneNormalize( NearPlane );

    XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
    FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin );
    FarPlane = XMPlaneNormalize( FarPlane );

    XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
    RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin );
    RightPlane = XMPlaneNormalize( RightPlane );

    XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
    LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin );
    LeftPlane = XMPlaneNormalize( LeftPlane );

    XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
    TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin );
    TopPlane = XMPlaneNormalize( TopPlane );

    XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
    BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin );
    BottomPlane = XMPlaneNormalize( BottomPlane );

    return box.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane );
}


//-----------------------------------------------------------------------------
_Use_decl_annotations_
inline ContainmentType BoundingFrustum::Contains( const BoundingFrustum& fr ) const
{
    // Load origin and orientation of the frustum.
+ XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + // Create 6 planes (do it inline to encourage use of registers) + XMVECTOR NearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + NearPlane = DirectX::Internal::XMPlaneTransform( NearPlane, vOrientation, vOrigin ); + NearPlane = XMPlaneNormalize( NearPlane ); + + XMVECTOR FarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + FarPlane = DirectX::Internal::XMPlaneTransform( FarPlane, vOrientation, vOrigin ); + FarPlane = XMPlaneNormalize( FarPlane ); + + XMVECTOR RightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + RightPlane = DirectX::Internal::XMPlaneTransform( RightPlane, vOrientation, vOrigin ); + RightPlane = XMPlaneNormalize( RightPlane ); + + XMVECTOR LeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + LeftPlane = DirectX::Internal::XMPlaneTransform( LeftPlane, vOrientation, vOrigin ); + LeftPlane = XMPlaneNormalize( LeftPlane ); + + XMVECTOR TopPlane = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + TopPlane = DirectX::Internal::XMPlaneTransform( TopPlane, vOrientation, vOrigin ); + TopPlane = XMPlaneNormalize( TopPlane ); + + XMVECTOR BottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + BottomPlane = DirectX::Internal::XMPlaneTransform( BottomPlane, vOrientation, vOrigin ); + BottomPlane = XMPlaneNormalize( BottomPlane ); + + return fr.ContainedBy( NearPlane, FarPlane, RightPlane, LeftPlane, TopPlane, BottomPlane ); +} + + +//----------------------------------------------------------------------------- +// Exact sphere vs frustum test. 
The algorithm first checks the sphere against +// the planes of the frustum, then if the plane checks were indeterminate finds +// the nearest feature (plane, line, point) on the frustum to the center of the +// sphere and compares the distance to the nearest feature to the radius of the +// sphere +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingSphere& sh ) const +{ + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Normalize the planes so we can compare to the sphere radius. + Planes[2] = XMVector3Normalize( Planes[2] ); + Planes[3] = XMVector3Normalize( Planes[3] ); + Planes[4] = XMVector3Normalize( Planes[4] ); + Planes[5] = XMVector3Normalize( Planes[5] ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Load the sphere. + XMVECTOR vCenter = XMLoadFloat3( &sh.Center ); + XMVECTOR vRadius = XMVectorReplicatePtr( &sh.Radius ); + + // Transform the center of the sphere into the local space of frustum. + vCenter = XMVector3InverseRotate( vCenter - vOrigin, vOrientation ); + + // Set w of the center to one so we can dot4 with the plane. + vCenter = XMVectorInsert<0, 0, 0, 0, 1>( vCenter, XMVectorSplatOne() ); + + // Check against each plane of the frustum. 
+ XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + XMVECTOR Dist[6]; + + for( size_t i = 0; i < 6; ++i ) + { + Dist[i] = XMVector4Dot( vCenter, Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist[i], vRadius ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist[i], -vRadius ) ); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist[i], Zero ) ); + } + + // If the sphere is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the sphere is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // If the center of the sphere is inside all planes and the sphere intersects + // one or more planes then it must intersect. + if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) + return true; + + // The sphere may be outside the frustum or intersecting the frustum. + // Find the nearest feature (face, edge, or corner) on the frustum + // to the sphere. + + // The faces adjacent to each face are: + static const size_t adjacent_faces[6][4] = + { + { 2, 3, 4, 5 }, // 0 + { 2, 3, 4, 5 }, // 1 + { 0, 1, 4, 5 }, // 2 + { 0, 1, 4, 5 }, // 3 + { 0, 1, 2, 3 }, // 4 + { 0, 1, 2, 3 } + }; // 5 + + XMVECTOR Intersects = XMVectorFalseInt(); + + // Check to see if the nearest feature is one of the planes. + for( size_t i = 0; i < 6; ++i ) + { + // Find the nearest point on the plane to the center of the sphere. + XMVECTOR Point = vCenter - (Planes[i] * Dist[i]); + + // Set w of the point to one. + Point = XMVectorInsert<0, 0, 0, 0, 1>( Point, XMVectorSplatOne() ); + + // If the point is inside the face (inside the adjacent planes) then + // this plane is the nearest feature. 
+ XMVECTOR InsideFace = XMVectorTrueInt(); + + for ( size_t j = 0; j < 4; j++ ) + { + size_t plane_index = adjacent_faces[i][j]; + + InsideFace = XMVectorAndInt( InsideFace, + XMVectorLessOrEqual( XMVector4Dot( Point, Planes[plane_index] ), Zero ) ); + } + + // Since we have already checked distance from the plane we know that the + // sphere must intersect if this plane is the nearest feature. + Intersects = XMVectorOrInt( Intersects, + XMVectorAndInt( XMVectorGreater( Dist[i], Zero ), InsideFace ) ); + } + + if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. + XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + // The Edges are: + static const size_t edges[12][2] = + { + { 0, 1 }, { 2, 3 }, { 0, 2 }, { 1, 3 }, // Near plane + { 4, 5 }, { 6, 7 }, { 4, 6 }, { 5, 7 }, // Far plane + { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 }, + }; // Near to far + + XMVECTOR RadiusSq = vRadius * vRadius; + + // Check to see if the nearest feature is one of the edges (or corners). + for( size_t i = 0; i < 12; ++i ) + { + size_t ei0 = edges[i][0]; + size_t ei1 = edges[i][1]; + + // Find the nearest point on the edge to the center of the sphere. + // The corners of the frustum are included as the endpoints of the edges. 
+ XMVECTOR Point = DirectX::Internal::PointOnLineSegmentNearestPoint( Corners[ei0], Corners[ei1], vCenter ); + + XMVECTOR Delta = vCenter - Point; + + XMVECTOR DistSq = XMVector3Dot( Delta, Delta ); + + // If the distance to the center of the sphere to the point is less than + // the radius of the sphere then it must intersect. + Intersects = XMVectorOrInt( Intersects, XMVectorLessOrEqual( DistSq, RadiusSq ) ); + } + + if ( XMVector4EqualInt( Intersects, XMVectorTrueInt() ) ) + return true; + + // The sphere must be outside the frustum. + return false; +} + + +//----------------------------------------------------------------------------- +// Exact axis aligned box vs frustum test. Constructs an oriented box and uses +// the oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingBox& box ) const +{ + // Make the axis aligned box oriented and do an OBB vs frustum test. + BoundingOrientedBox obox( box.Center, box.Extents, XMFLOAT4( 0.f, 0.f, 0.f, 1.f ) ); + return Intersects( obox ); +} + + +//----------------------------------------------------------------------------- +// Exact oriented box vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingOrientedBox& box ) const +{ + static const XMVECTORI32 SelectY = + { + XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 + }; + static const XMVECTORI32 SelectZ = + { + XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 + }; + + XMVECTOR Zero = XMVectorZero(); + + // Build the frustum planes. 
+ XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR FrustumOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( FrustumOrientation ) ); + + // Load the box. + XMVECTOR Center = XMLoadFloat3( &box.Center ); + XMVECTOR Extents = XMLoadFloat3( &box.Extents ); + XMVECTOR BoxOrientation = XMLoadFloat4( &box.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( BoxOrientation ) ); + + // Transform the oriented box into the space of the frustum in order to + // minimize the number of transforms we have to do. + Center = XMVector3InverseRotate( Center - vOrigin, FrustumOrientation ); + BoxOrientation = XMQuaternionMultiply( BoxOrientation, XMQuaternionConjugate( FrustumOrientation ) ); + + // Set w of the center to one so we can dot4 with the plane. + Center = XMVectorInsert<0, 0, 0, 0, 1>( Center, XMVectorSplatOne() ); + + // Build the 3x3 rotation matrix that defines the box axes. + XMMATRIX R = XMMatrixRotationQuaternion( BoxOrientation ); + + // Check against each plane of the frustum. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + XMVECTOR CenterInsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + // Compute the distance to the center of the box. + XMVECTOR Dist = XMVector4Dot( Center, Planes[i] ); + + // Project the axes of the box onto the normal of the plane. 
Half the + // length of the projection (sometime called the "radius") is equal to + // h(u) * abs(n dot b(u))) + h(v) * abs(n dot b(v)) + h(w) * abs(n dot b(w)) + // where h(i) are extents of the box, n is the plane normal, and b(i) are the + // axes of the box. + XMVECTOR Radius = XMVector3Dot( Planes[i], R.r[0] ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[1] ), SelectY ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Planes[i], R.r[2] ), SelectZ ); + Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, Radius ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Dist, -Radius ) ); + + // Check if the center is inside the plane. + CenterInsideAll = XMVectorAndInt( CenterInsideAll, XMVectorLessOrEqual( Dist, Zero ) ); + } + + // If the box is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the box is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // If the center of the box is inside all planes and the box intersects + // one or more planes then it must intersect. + if ( XMVector4EqualInt( CenterInsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. 
+ XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + // Test against box axes (3) + { + // Find the min/max values of the projection of the frustum onto each axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = XMVector3Dot( Corners[0], R.r[0] ); + FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[1] ), SelectY ); + FrustumMin = XMVectorSelect( FrustumMin, XMVector3Dot( Corners[0], R.r[2] ), SelectZ ); + FrustumMax = FrustumMin; + + for( size_t i = 1; i < BoundingOrientedBox::CORNER_COUNT; ++i ) + { + XMVECTOR Temp = XMVector3Dot( Corners[i], R.r[0] ); + Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[1] ), SelectY ); + Temp = XMVectorSelect( Temp, XMVector3Dot( Corners[i], R.r[2] ), SelectZ ); + + FrustumMin = XMVectorMin( FrustumMin, Temp ); + FrustumMax = XMVectorMax( FrustumMax, Temp ); + } + + // Project the center of the box onto the axes. + XMVECTOR BoxDist = XMVector3Dot( Center, R.r[0] ); + BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[1] ), SelectY ); + BoxDist = XMVectorSelect( BoxDist, XMVector3Dot( Center, R.r[2] ), SelectZ ); + + // The projection of the box onto the axis is just its Center and Extents. 
+ // if (min > box_max || max < box_min) reject; + XMVECTOR Result = XMVectorOrInt( XMVectorGreater( FrustumMin, BoxDist + Extents ), + XMVectorLess( FrustumMax, BoxDist - Extents ) ); + + if( DirectX::Internal::XMVector3AnyTrue( Result ) ) + return false; + } + + // Test against edge/edge axes (3*6). + XMVECTOR FrustumEdgeAxis[6]; + + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = vRightTop - vLeftTop; + FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; + + for( size_t i = 0; i < 3; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross( R.r[i], FrustumEdgeAxis[j] ); + + // Find the min/max values of the projection of the frustum onto the axis. + XMVECTOR FrustumMin, FrustumMax; + + FrustumMin = FrustumMax = XMVector3Dot( Axis, Corners[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); + FrustumMin = XMVectorMin( FrustumMin, Temp ); + FrustumMax = XMVectorMax( FrustumMax, Temp ); + } + + // Project the center of the box onto the axis. + XMVECTOR Dist = XMVector3Dot( Center, Axis ); + + // Project the axes of the box onto the axis to find the "radius" of the box. + XMVECTOR Radius = XMVector3Dot( Axis, R.r[0] ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[1] ), SelectY ); + Radius = XMVectorSelect( Radius, XMVector3Dot( Axis, R.r[2] ), SelectZ ); + Radius = XMVector3Dot( Extents, XMVectorAbs( Radius ) ); + + // if (center > max + radius || center < min - radius) reject; + Outside = XMVectorOrInt( Outside, XMVectorGreater( Dist, FrustumMax + Radius ) ); + Outside = XMVectorOrInt( Outside, XMVectorLess( Dist, FrustumMin - Radius ) ); + } + } + + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the box must intersect the frustum. 
+ return true; +} + + +//----------------------------------------------------------------------------- +// Exact frustum vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( const BoundingFrustum& fr ) const +{ + // Load origin and orientation of frustum B. + XMVECTOR OriginB = XMLoadFloat3( &Origin ); + XMVECTOR OrientationB = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( OrientationB ) ); + + // Build the planes of frustum B. + XMVECTOR AxisB[6]; + AxisB[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f ); + AxisB[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f ); + AxisB[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + AxisB[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + AxisB[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + AxisB[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + XMVECTOR PlaneDistB[6]; + PlaneDistB[0] = -XMVectorReplicatePtr( &Near ); + PlaneDistB[1] = XMVectorReplicatePtr( &Far ); + PlaneDistB[2] = XMVectorZero(); + PlaneDistB[3] = XMVectorZero(); + PlaneDistB[4] = XMVectorZero(); + PlaneDistB[5] = XMVectorZero(); + + // Load origin and orientation of frustum A. + XMVECTOR OriginA = XMLoadFloat3( &fr.Origin ); + XMVECTOR OrientationA = XMLoadFloat4( &fr.Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( OrientationA ) ); + + // Transform frustum A into the space of the frustum B in order to + // minimize the number of transforms we have to do. + OriginA = XMVector3InverseRotate( OriginA - OriginB, OrientationB ); + OrientationA = XMQuaternionMultiply( OrientationA, XMQuaternionConjugate( OrientationB ) ); + + // Build the corners of frustum A (in the local space of B). 
+ XMVECTOR RightTopA = XMVectorSet( fr.RightSlope, fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottomA = XMVectorSet( fr.RightSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTopA = XMVectorSet(fr.LeftSlope,fr.TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottomA = XMVectorSet( fr.LeftSlope, fr.BottomSlope, 1.0f, 0.0f ); + XMVECTOR NearA = XMVectorReplicatePtr( &fr.Near ); + XMVECTOR FarA = XMVectorReplicatePtr( &fr.Far ); + + RightTopA = XMVector3Rotate( RightTopA, OrientationA ); + RightBottomA = XMVector3Rotate( RightBottomA, OrientationA ); + LeftTopA = XMVector3Rotate( LeftTopA, OrientationA ); + LeftBottomA = XMVector3Rotate( LeftBottomA, OrientationA ); + + XMVECTOR CornersA[CORNER_COUNT]; + CornersA[0] = OriginA + RightTopA * NearA; + CornersA[1] = OriginA + RightBottomA * NearA; + CornersA[2] = OriginA + LeftTopA * NearA; + CornersA[3] = OriginA + LeftBottomA * NearA; + CornersA[4] = OriginA + RightTopA * FarA; + CornersA[5] = OriginA + RightBottomA * FarA; + CornersA[6] = OriginA + LeftTopA * FarA; + CornersA[7] = OriginA + LeftBottomA * FarA; + + // Check frustum A against each plane of frustum B. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + // Find the min/max projection of the frustum onto the plane normal. + XMVECTOR Min, Max; + + Min = Max = XMVector3Dot( AxisB[i], CornersA[0] ); + + for( size_t j = 1; j < CORNER_COUNT; j++ ) + { + XMVECTOR Temp = XMVector3Dot( AxisB[i], CornersA[j] ); + Min = XMVectorMin( Min, Temp ); + Max = XMVectorMax( Max, Temp ); + } + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistB[i] ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( Max, PlaneDistB[i] ) ); + } + + // If the frustum A is outside any of the planes of frustum B it is outside. 
+ if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If frustum A is inside all planes of frustum B it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of frustum B. + XMVECTOR RightTopB = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottomB = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTopB = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottomB = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR NearB = XMVectorReplicatePtr( &Near ); + XMVECTOR FarB = XMVectorReplicatePtr( &Far ); + + XMVECTOR CornersB[BoundingFrustum::CORNER_COUNT]; + CornersB[0] = RightTopB * NearB; + CornersB[1] = RightBottomB * NearB; + CornersB[2] = LeftTopB * NearB; + CornersB[3] = LeftBottomB * NearB; + CornersB[4] = RightTopB * FarB; + CornersB[5] = RightBottomB * FarB; + CornersB[6] = LeftTopB * FarB; + CornersB[7] = LeftBottomB * FarB; + + // Build the planes of frustum A (in the local space of B). + XMVECTOR AxisA[6]; + XMVECTOR PlaneDistA[6]; + + AxisA[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, 0.0f ); + AxisA[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, 0.0f ); + AxisA[2] = XMVectorSet( 1.0f, 0.0f, -fr.RightSlope, 0.0f ); + AxisA[3] = XMVectorSet( -1.0f, 0.0f, fr.LeftSlope, 0.0f ); + AxisA[4] = XMVectorSet( 0.0f, 1.0f, -fr.TopSlope, 0.0f ); + AxisA[5] = XMVectorSet( 0.0f, -1.0f, fr.BottomSlope, 0.0f ); + + AxisA[0] = XMVector3Rotate( AxisA[0], OrientationA ); + AxisA[1] = -AxisA[0]; + AxisA[2] = XMVector3Rotate( AxisA[2], OrientationA ); + AxisA[3] = XMVector3Rotate( AxisA[3], OrientationA ); + AxisA[4] = XMVector3Rotate( AxisA[4], OrientationA ); + AxisA[5] = XMVector3Rotate( AxisA[5], OrientationA ); + + PlaneDistA[0] = XMVector3Dot( AxisA[0], CornersA[0] ); // Re-use corner on near plane. + PlaneDistA[1] = XMVector3Dot( AxisA[1], CornersA[4] ); // Re-use corner on far plane. 
+ PlaneDistA[2] = XMVector3Dot( AxisA[2], OriginA ); + PlaneDistA[3] = XMVector3Dot( AxisA[3], OriginA ); + PlaneDistA[4] = XMVector3Dot( AxisA[4], OriginA ); + PlaneDistA[5] = XMVector3Dot( AxisA[5], OriginA ); + + // Check each axis of frustum A for a seperating plane (5). + for( size_t i = 0; i < 6; ++i ) + { + // Find the minimum projection of the frustum onto the plane normal. + XMVECTOR Min; + + Min = XMVector3Dot( AxisA[i], CornersB[0] ); + + for( size_t j = 1; j < CORNER_COUNT; j++ ) + { + XMVECTOR Temp = XMVector3Dot( AxisA[i], CornersB[j] ); + Min = XMVectorMin( Min, Temp ); + } + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( Min, PlaneDistA[i] ) ); + } + + // If the frustum B is outside any of the planes of frustum A it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // Check edge/edge axes (6 * 6). + XMVECTOR FrustumEdgeAxisA[6]; + FrustumEdgeAxisA[0] = RightTopA; + FrustumEdgeAxisA[1] = RightBottomA; + FrustumEdgeAxisA[2] = LeftTopA; + FrustumEdgeAxisA[3] = LeftBottomA; + FrustumEdgeAxisA[4] = RightTopA - LeftTopA; + FrustumEdgeAxisA[5] = LeftBottomA - LeftTopA; + + XMVECTOR FrustumEdgeAxisB[6]; + FrustumEdgeAxisB[0] = RightTopB; + FrustumEdgeAxisB[1] = RightBottomB; + FrustumEdgeAxisB[2] = LeftTopB; + FrustumEdgeAxisB[3] = LeftBottomB; + FrustumEdgeAxisB[4] = RightTopB - LeftTopB; + FrustumEdgeAxisB[5] = LeftBottomB - LeftTopB; + + for( size_t i = 0; i < 6; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. + XMVECTOR Axis = XMVector3Cross( FrustumEdgeAxisA[i], FrustumEdgeAxisB[j] ); + + // Find the min/max values of the projection of both frustums onto the axis. 
+ XMVECTOR MinA, MaxA; + XMVECTOR MinB, MaxB; + + MinA = MaxA = XMVector3Dot( Axis, CornersA[0] ); + MinB = MaxB = XMVector3Dot( Axis, CornersB[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR TempA = XMVector3Dot( Axis, CornersA[k] ); + MinA = XMVectorMin( MinA, TempA ); + MaxA = XMVectorMax( MaxA, TempA ); + + XMVECTOR TempB = XMVector3Dot( Axis, CornersB[k] ); + MinB = XMVectorMin( MinB, TempB ); + MaxB = XMVectorMax( MaxB, TempB ); + } + + // if (MinA > MaxB || MinB > MaxA) reject + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); + } + } + + // If there is a seperating plane, then the frustums do not intersect. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the frustums intersect. + return true; +} + + +//----------------------------------------------------------------------------- +// Triangle vs frustum test. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool BoundingFrustum::Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2 ) const +{ + // Build the frustum planes (NOTE: D is negated from the usual). + XMVECTOR Planes[6]; + Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, -Near ); + Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, Far ); + Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f ); + Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Transform triangle into the local space of frustum. 
+ XMVECTOR TV0 = XMVector3InverseRotate( V0 - vOrigin, vOrientation ); + XMVECTOR TV1 = XMVector3InverseRotate( V1 - vOrigin, vOrientation ); + XMVECTOR TV2 = XMVector3InverseRotate( V2 - vOrigin, vOrientation ); + + // Test each vertex of the triangle against the frustum planes. + XMVECTOR Outside = XMVectorFalseInt(); + XMVECTOR InsideAll = XMVectorTrueInt(); + + for( size_t i = 0; i < 6; ++i ) + { + XMVECTOR Dist0 = XMVector3Dot( TV0, Planes[i] ); + XMVECTOR Dist1 = XMVector3Dot( TV1, Planes[i] ); + XMVECTOR Dist2 = XMVector3Dot( TV2, Planes[i] ); + + XMVECTOR MinDist = XMVectorMin( Dist0, Dist1 ); + MinDist = XMVectorMin( MinDist, Dist2 ); + XMVECTOR MaxDist = XMVectorMax( Dist0, Dist1 ); + MaxDist = XMVectorMax( MaxDist, Dist2 ); + + XMVECTOR PlaneDist = XMVectorSplatW( Planes[i] ); + + // Outside the plane? + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinDist, PlaneDist ) ); + + // Fully inside the plane? + InsideAll = XMVectorAndInt( InsideAll, XMVectorLessOrEqual( MaxDist, PlaneDist ) ); + } + + // If the triangle is outside any of the planes it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If the triangle is inside all planes it is fully inside. + if ( XMVector4EqualInt( InsideAll, XMVectorTrueInt() ) ) + return true; + + // Build the corners of the frustum. 
+ XMVECTOR vRightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vRightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vLeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR vLeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + XMVECTOR Corners[CORNER_COUNT]; + Corners[0] = vRightTop * vNear; + Corners[1] = vRightBottom * vNear; + Corners[2] = vLeftTop * vNear; + Corners[3] = vLeftBottom * vNear; + Corners[4] = vRightTop * vFar; + Corners[5] = vRightBottom * vFar; + Corners[6] = vLeftTop * vFar; + Corners[7] = vLeftBottom * vFar; + + // Test the plane of the triangle. + XMVECTOR Normal = XMVector3Cross( V1 - V0, V2 - V0 ); + XMVECTOR Dist = XMVector3Dot( Normal, V0 ); + + XMVECTOR MinDist, MaxDist; + MinDist = MaxDist = XMVector3Dot( Corners[0], Normal ); + for( size_t i = 1; i < CORNER_COUNT; ++i ) + { + XMVECTOR Temp = XMVector3Dot( Corners[i], Normal ); + MinDist = XMVectorMin( MinDist, Temp ); + MaxDist = XMVectorMax( MaxDist, Temp ); + } + + Outside = XMVectorOrInt( XMVectorGreater( MinDist, Dist ), XMVectorLess( MaxDist, Dist ) ); + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // Check the edge/edge axes (3*6). + XMVECTOR TriangleEdgeAxis[3]; + TriangleEdgeAxis[0] = V1 - V0; + TriangleEdgeAxis[1] = V2 - V1; + TriangleEdgeAxis[2] = V0 - V2; + + XMVECTOR FrustumEdgeAxis[6]; + FrustumEdgeAxis[0] = vRightTop; + FrustumEdgeAxis[1] = vRightBottom; + FrustumEdgeAxis[2] = vLeftTop; + FrustumEdgeAxis[3] = vLeftBottom; + FrustumEdgeAxis[4] = vRightTop - vLeftTop; + FrustumEdgeAxis[5] = vLeftBottom - vLeftTop; + + for( size_t i = 0; i < 3; ++i ) + { + for( size_t j = 0; j < 6; j++ ) + { + // Compute the axis we are going to test. 
+ XMVECTOR Axis = XMVector3Cross( TriangleEdgeAxis[i], FrustumEdgeAxis[j] ); + + // Find the min/max of the projection of the triangle onto the axis. + XMVECTOR MinA, MaxA; + + XMVECTOR Dist0 = XMVector3Dot( V0, Axis ); + XMVECTOR Dist1 = XMVector3Dot( V1, Axis ); + XMVECTOR Dist2 = XMVector3Dot( V2, Axis ); + + MinA = XMVectorMin( Dist0, Dist1 ); + MinA = XMVectorMin( MinA, Dist2 ); + MaxA = XMVectorMax( Dist0, Dist1 ); + MaxA = XMVectorMax( MaxA, Dist2 ); + + // Find the min/max of the projection of the frustum onto the axis. + XMVECTOR MinB, MaxB; + + MinB = MaxB = XMVector3Dot( Axis, Corners[0] ); + + for( size_t k = 1; k < CORNER_COUNT; k++ ) + { + XMVECTOR Temp = XMVector3Dot( Axis, Corners[k] ); + MinB = XMVectorMin( MinB, Temp ); + MaxB = XMVectorMax( MaxB, Temp ); + } + + // if (MinA > MaxB || MinB > MaxA) reject; + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinA, MaxB ) ); + Outside = XMVectorOrInt( Outside, XMVectorGreater( MinB, MaxA ) ); + } + } + + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return false; + + // If we did not find a separating plane then the triangle must intersect the frustum. + return true; +} + + +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline PlaneIntersectionType BoundingFrustum::Intersects( FXMVECTOR Plane ) const +{ + assert( DirectX::Internal::XMPlaneIsUnit( Plane ) ); + + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); + + // Build the corners of the frustum (in world space). 
+ XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + RightTop = XMVector3Rotate( RightTop, vOrientation ); + RightBottom = XMVector3Rotate( RightBottom, vOrientation ); + LeftTop = XMVector3Rotate( LeftTop, vOrientation ); + LeftBottom = XMVector3Rotate( LeftBottom, vOrientation ); + + XMVECTOR Corners0 = vOrigin + RightTop * vNear; + XMVECTOR Corners1 = vOrigin + RightBottom * vNear; + XMVECTOR Corners2 = vOrigin + LeftTop * vNear; + XMVECTOR Corners3 = vOrigin + LeftBottom * vNear; + XMVECTOR Corners4 = vOrigin + RightTop * vFar; + XMVECTOR Corners5 = vOrigin + RightBottom * vFar; + XMVECTOR Corners6 = vOrigin + LeftTop * vFar; + XMVECTOR Corners7 = vOrigin + LeftBottom * vFar; + + XMVECTOR Outside, Inside; + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane, Outside, Inside ); + + // If the frustum is outside any plane it is outside. + if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) ) + return FRONT; + + // If the frustum is inside all planes it is inside. + if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) ) + return BACK; + + // The frustum is not inside all planes or outside a plane it intersects. + return INTERSECTING; +} + + +//----------------------------------------------------------------------------- +// Ray vs. 
frustum test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool BoundingFrustum::Intersects( FXMVECTOR rayOrigin, FXMVECTOR Direction, float& Dist ) const
+{
+    // If ray starts inside the frustum, return a distance of 0 for the hit
+    if ( Contains(rayOrigin) == CONTAINS )
+    {
+        Dist = 0.0f;
+        return true;
+    }
+
+    // Build the frustum planes (in frustum-local space; transformed to world below).
+    XMVECTOR Planes[6];
+    Planes[0] = XMVectorSet( 0.0f, 0.0f, -1.0f, Near );
+    Planes[1] = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far );
+    Planes[2] = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f );
+    Planes[3] = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f );
+    Planes[4] = XMVectorSet( 0.0f, 1.0f, -TopSlope, 0.0f );
+    Planes[5] = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f );
+
+    // Load origin and orientation of the frustum.
+    XMVECTOR frOrigin = XMLoadFloat3( &Origin );
+    XMVECTOR frOrientation = XMLoadFloat4( &Orientation );
+
+    // This algorithm is based on "Fast Ray-Convex Polyhedron Intersection," in James Arvo, ed., Graphics Gems II pp. 247-250
+    float tnear = -FLT_MAX;
+    float tfar = FLT_MAX;
+
+    for( size_t i=0; i < 6; ++i )
+    {
+        XMVECTOR Plane = DirectX::Internal::XMPlaneTransform( Planes[i], frOrientation, frOrigin );
+        Plane = XMPlaneNormalize( Plane );
+
+        XMVECTOR AxisDotOrigin = XMPlaneDotCoord( Plane, rayOrigin );
+        XMVECTOR AxisDotDirection = XMVector3Dot( Plane, Direction );
+
+        if ( XMVector3LessOrEqual( XMVectorAbs( AxisDotDirection ), g_RayEpsilon ) )
+        {
+            // Ray is parallel to plane - check if ray origin is inside plane's half-space.
+            if ( XMVector3Greater( AxisDotOrigin, g_XMZero ) )
+            {
+                // Ray origin is outside half-space.
+                Dist = 0.f;
+                return false;
+            }
+        }
+        else
+        {
+            // Ray not parallel - get distance to plane.
+            float vd = XMVectorGetX( AxisDotDirection );
+            float vn = XMVectorGetX( AxisDotOrigin );
+            float t = -vn / vd;
+            if (vd < 0.0f)
+            {
+                // Front face - T is a near point. 
+ if (t > tfar) + { + Dist = 0.f; + return false; + } + if (t > tnear) + { + // Hit near face. + tnear = t; + } + } + else + { + // back face - T is far point. + if (t < tnear) + { + Dist = 0.f; + return false; + } + if (t < tfar) + { + // Hit far face. + tfar = t; + } + } + } + } + + // Survived all tests. + // Note: if ray originates on polyhedron, may want to change 0.0f to some + // epsilon to avoid intersecting the originating face. + float distance = ( tnear >= 0.0f ) ? tnear : tfar; + if (distance >= 0.0f) + { + Dist = distance; + return true; + } + + Dist = 0.f; + return false; +} + + +//----------------------------------------------------------------------------- +// Test a frustum vs 6 planes (typically forming another frustum). +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline ContainmentType BoundingFrustum::ContainedBy( FXMVECTOR Plane0, FXMVECTOR Plane1, FXMVECTOR Plane2, + GXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 ) const +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + assert( DirectX::Internal::XMQuaternionIsUnit( vOrientation ) ); + + // Set w of the origin to one so we can dot4 with a plane. + vOrigin = XMVectorInsert<0, 0, 0, 0, 1>( vOrigin, XMVectorSplatOne() ); + + // Build the corners of the frustum (in world space). 
+ XMVECTOR RightTop = XMVectorSet( RightSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR RightBottom = XMVectorSet( RightSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR LeftTop = XMVectorSet( LeftSlope, TopSlope, 1.0f, 0.0f ); + XMVECTOR LeftBottom = XMVectorSet( LeftSlope, BottomSlope, 1.0f, 0.0f ); + XMVECTOR vNear = XMVectorReplicatePtr( &Near ); + XMVECTOR vFar = XMVectorReplicatePtr( &Far ); + + RightTop = XMVector3Rotate( RightTop, vOrientation ); + RightBottom = XMVector3Rotate( RightBottom, vOrientation ); + LeftTop = XMVector3Rotate( LeftTop, vOrientation ); + LeftBottom = XMVector3Rotate( LeftBottom, vOrientation ); + + XMVECTOR Corners0 = vOrigin + RightTop * vNear; + XMVECTOR Corners1 = vOrigin + RightBottom * vNear; + XMVECTOR Corners2 = vOrigin + LeftTop * vNear; + XMVECTOR Corners3 = vOrigin + LeftBottom * vNear; + XMVECTOR Corners4 = vOrigin + RightTop * vFar; + XMVECTOR Corners5 = vOrigin + RightBottom * vFar; + XMVECTOR Corners6 = vOrigin + LeftTop * vFar; + XMVECTOR Corners7 = vOrigin + LeftBottom * vFar; + + XMVECTOR Outside, Inside; + + // Test against each plane. 
+ DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane1, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane2, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane3, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane4, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectFrustumPlane( Corners0, Corners1, Corners2, Corners3, + Corners4, Corners5, Corners6, Corners7, + Plane5, Outside, Inside ); + + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the frustum is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the frustum is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The frustum is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS; +} + + +//----------------------------------------------------------------------------- +// Build the 6 frustum planes from a frustum. +// +// The intended use for these routines is for fast culling to a view frustum. +// When the volume being tested against a view frustum is small relative to the +// view frustum it is usually either inside all six planes of the frustum +// (CONTAINS) or outside one of the planes of the frustum (DISJOINT). If neither +// of these cases is true then it may or may not be intersecting the frustum +// (INTERSECTS) +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::GetPlanes( XMVECTOR* NearPlane, XMVECTOR* FarPlane, XMVECTOR* RightPlane, + XMVECTOR* LeftPlane, XMVECTOR* TopPlane, XMVECTOR* BottomPlane ) const +{ + // Load origin and orientation of the frustum. + XMVECTOR vOrigin = XMLoadFloat3( &Origin ); + XMVECTOR vOrientation = XMLoadFloat4( &Orientation ); + + if (NearPlane) + { + XMVECTOR vNearPlane = XMVectorSet( 0.0f, 0.0f, -1.0f, Near ); + vNearPlane = DirectX::Internal::XMPlaneTransform( vNearPlane, vOrientation, vOrigin ); + *NearPlane = XMPlaneNormalize( vNearPlane ); + } + + if (FarPlane) + { + XMVECTOR vFarPlane = XMVectorSet( 0.0f, 0.0f, 1.0f, -Far ); + vFarPlane = DirectX::Internal::XMPlaneTransform( vFarPlane, vOrientation, vOrigin ); + *FarPlane = XMPlaneNormalize( vFarPlane ); + } + + if (RightPlane) + { + XMVECTOR vRightPlane = XMVectorSet( 1.0f, 0.0f, -RightSlope, 0.0f ); + vRightPlane = DirectX::Internal::XMPlaneTransform( vRightPlane, vOrientation, vOrigin ); + *RightPlane = XMPlaneNormalize( vRightPlane ); + } + + if (LeftPlane) + { + XMVECTOR vLeftPlane = XMVectorSet( -1.0f, 0.0f, LeftSlope, 0.0f ); + vLeftPlane = DirectX::Internal::XMPlaneTransform( vLeftPlane, vOrientation, vOrigin ); + *LeftPlane = XMPlaneNormalize( vLeftPlane ); + } + + if (TopPlane) + { + XMVECTOR vTopPlane = XMVectorSet( 0.0f, 1.0f, 
-TopSlope, 0.0f ); + vTopPlane = DirectX::Internal::XMPlaneTransform( vTopPlane, vOrientation, vOrigin ); + *TopPlane = XMPlaneNormalize( vTopPlane ); + } + + if (BottomPlane) + { + XMVECTOR vBottomPlane = XMVectorSet( 0.0f, -1.0f, BottomSlope, 0.0f ); + vBottomPlane = DirectX::Internal::XMPlaneTransform( vBottomPlane, vOrientation, vOrigin ); + *BottomPlane = XMPlaneNormalize( vBottomPlane ); + } +} + + +//----------------------------------------------------------------------------- +// Build a frustum from a persepective projection matrix. The matrix may only +// contain a projection; any rotation, translation or scale will cause the +// constructed frustum to be incorrect. +//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline void BoundingFrustum::CreateFromMatrix( BoundingFrustum& Out, CXMMATRIX Projection ) +{ + // Corners of the projection frustum in homogenous space. + static XMVECTORF32 HomogenousPoints[6] = + { + { 1.0f, 0.0f, 1.0f, 1.0f }, // right (at far plane) + { -1.0f, 0.0f, 1.0f, 1.0f }, // left + { 0.0f, 1.0f, 1.0f, 1.0f }, // top + { 0.0f, -1.0f, 1.0f, 1.0f }, // bottom + + { 0.0f, 0.0f, 0.0f, 1.0f }, // near + { 0.0f, 0.0f, 1.0f, 1.0f } // far + }; + + XMVECTOR Determinant; + XMMATRIX matInverse = XMMatrixInverse( &Determinant, Projection ); + + // Compute the frustum corners in world space. + XMVECTOR Points[6]; + + for( size_t i = 0; i < 6; ++i ) + { + // Transform point. + Points[i] = XMVector4Transform( HomogenousPoints[i], matInverse ); + } + + Out.Origin = XMFLOAT3( 0.0f, 0.0f, 0.0f ); + Out.Orientation = XMFLOAT4( 0.0f, 0.0f, 0.0f, 1.0f ); + + // Compute the slopes. 
+ Points[0] = Points[0] * XMVectorReciprocal( XMVectorSplatZ( Points[0] ) ); + Points[1] = Points[1] * XMVectorReciprocal( XMVectorSplatZ( Points[1] ) ); + Points[2] = Points[2] * XMVectorReciprocal( XMVectorSplatZ( Points[2] ) ); + Points[3] = Points[3] * XMVectorReciprocal( XMVectorSplatZ( Points[3] ) ); + + Out.RightSlope = XMVectorGetX( Points[0] ); + Out.LeftSlope = XMVectorGetX( Points[1] ); + Out.TopSlope = XMVectorGetY( Points[2] ); + Out.BottomSlope = XMVectorGetY( Points[3] ); + + // Compute near and far. + Points[4] = Points[4] * XMVectorReciprocal( XMVectorSplatW( Points[4] ) ); + Points[5] = Points[5] * XMVectorReciprocal( XMVectorSplatW( Points[5] ) ); + + Out.Near = XMVectorGetZ( Points[4] ); + Out.Far = XMVectorGetZ( Points[5] ); +} + + +/**************************************************************************** + * + * TriangleTests + * + ****************************************************************************/ + +namespace TriangleTests +{ + +//----------------------------------------------------------------------------- +// Compute the intersection of a ray (Origin, Direction) with a triangle +// (V0, V1, V2). Return true if there is an intersection and also set *pDist +// to the distance along the ray to the intersection. +// +// The algorithm is based on Moller, Tomas and Trumbore, "Fast, Minimum Storage +// Ray-Triangle Intersection", Journal of Graphics Tools, vol. 2, no. 1, +// pp 21-28, 1997. 
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline bool Intersects( FXMVECTOR Origin, FXMVECTOR Direction, FXMVECTOR V0, GXMVECTOR V1, CXMVECTOR V2, float& Dist )
+{
+    assert( DirectX::Internal::XMVector3IsUnit( Direction ) );
+
+    XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR e1 = V1 - V0;
+    XMVECTOR e2 = V2 - V0;
+
+    // p = Direction ^ e2;
+    XMVECTOR p = XMVector3Cross( Direction, e2 );
+
+    // det = e1 * p;
+    XMVECTOR det = XMVector3Dot( e1, p );
+
+    XMVECTOR u, v, t;
+
+    if( XMVector3GreaterOrEqual( det, g_RayEpsilon ) )
+    {
+        // Determinant is positive (front side of the triangle).
+        XMVECTOR s = Origin - V0;
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorLess( u, Zero );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u, det ) );
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( v, Zero ) );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( u + v, det ) );
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( t, Zero ) );
+
+        if( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f;
+            return false;
+        }
+    }
+    else if( XMVector3LessOrEqual( det, g_RayNegEpsilon ) )
+    {
+        // Determinant is negative (back side of the triangle). 
+        XMVECTOR s = Origin - V0;
+
+        // u = s * p;
+        u = XMVector3Dot( s, p );
+
+        XMVECTOR NoIntersection = XMVectorGreater( u, Zero );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u, det ) );
+
+        // q = s ^ e1;
+        XMVECTOR q = XMVector3Cross( s, e1 );
+
+        // v = Direction * q;
+        v = XMVector3Dot( Direction, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( v, Zero ) );
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorLess( u + v, det ) );
+
+        // t = e2 * q;
+        t = XMVector3Dot( e2, q );
+
+        NoIntersection = XMVectorOrInt( NoIntersection, XMVectorGreater( t, Zero ) );
+
+        if ( XMVector4EqualInt( NoIntersection, XMVectorTrueInt() ) )
+        {
+            Dist = 0.f;
+            return false;
+        }
+    }
+    else
+    {
+        // Parallel ray.
+        Dist = 0.f;
+        return false;
+    }
+
+    t = XMVectorDivide ( t, det );
+
+    // (u / det) and (v / det) are the barycentric coordinates of the intersection.
+
+    // Store the x-component to *pDist
+    XMStoreFloat( &Dist, t );
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test if two triangles intersect.
+//
+// The final test of the algorithm is based on Shen, Heng, and Tang, "A Fast
+// Triangle-Triangle Overlap Test Using Signed Distances", Journal of Graphics
+// Tools, vol. 8, no. 1, pp 17-23, 2003 and Guigue and Devillers, "Fast and
+// Robust Triangle-Triangle Overlap Test Using Orientation Predicates", Journal
+// of Graphics Tools, vol. 8, no. 1, pp 25-32, 2003.
+//
+// The final test could be considered an edge-edge separating plane test with
+// the 9 possible cases narrowed down to the only two pairs of edges that can
+// actually result in a separation. 
+//----------------------------------------------------------------------------- +_Use_decl_annotations_ +inline bool Intersects( FXMVECTOR A0, FXMVECTOR A1, FXMVECTOR A2, GXMVECTOR B0, CXMVECTOR B1, CXMVECTOR B2 ) +{ + static const XMVECTORI32 SelectY = + { + XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_0 + }; + static const XMVECTORI32 SelectZ = + { + XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 + }; + static const XMVECTORI32 Select0111 = + { + XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_1 + }; + static const XMVECTORI32 Select1011 = + { + XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 + }; + static const XMVECTORI32 Select1101 = + { + XM_SELECT_1, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1 + }; + + XMVECTOR Zero = XMVectorZero(); + + // Compute the normal of triangle A. + XMVECTOR N1 = XMVector3Cross( A1 - A0, A2 - A0 ); + + // Assert that the triangle is not degenerate. + assert( !XMVector3Equal( N1, Zero ) ); + + // Test points of B against the plane of A. + XMVECTOR BDist = XMVector3Dot( N1, B0 - A0 ); + BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B1 - A0 ), SelectY ); + BDist = XMVectorSelect( BDist, XMVector3Dot( N1, B2 - A0 ), SelectZ ); + + // Ensure robustness with co-planar triangles by zeroing small distances. + uint32_t BDistIsZeroCR; + XMVECTOR BDistIsZero = XMVectorGreaterR( &BDistIsZeroCR, g_RayEpsilon, XMVectorAbs( BDist ) ); + BDist = XMVectorSelect( BDist, Zero, BDistIsZero ); + + uint32_t BDistIsLessCR; + XMVECTOR BDistIsLess = XMVectorGreaterR( &BDistIsLessCR, Zero, BDist ); + + uint32_t BDistIsGreaterCR; + XMVECTOR BDistIsGreater = XMVectorGreaterR( &BDistIsGreaterCR, BDist, Zero ); + + // If all the points are on the same side we don't intersect. + if( XMComparisonAllTrue( BDistIsLessCR ) || XMComparisonAllTrue( BDistIsGreaterCR ) ) + return false; + + // Compute the normal of triangle B. + XMVECTOR N2 = XMVector3Cross( B1 - B0, B2 - B0 ); + + // Assert that the triangle is not degenerate. 
+    assert( !XMVector3Equal( N2, Zero ) );
+
+    // Test points of A against the plane of B.
+    XMVECTOR ADist = XMVector3Dot( N2, A0 - B0 );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A1 - B0 ), SelectY );
+    ADist = XMVectorSelect( ADist, XMVector3Dot( N2, A2 - B0 ), SelectZ );
+
+    // Ensure robustness with co-planar triangles by zeroing small distances.
+    // BUGFIX: was XMVectorAbs( BDist ) — the zero test for triangle A's signed
+    // distances must use ADist, not triangle B's distances (matches the fix
+    // applied in later upstream DirectXMath releases).
+    uint32_t ADistIsZeroCR;
+    XMVECTOR ADistIsZero = XMVectorGreaterR( &ADistIsZeroCR, g_RayEpsilon, XMVectorAbs( ADist ) );
+    ADist = XMVectorSelect( ADist, Zero, ADistIsZero );
+
+    uint32_t ADistIsLessCR;
+    XMVECTOR ADistIsLess = XMVectorGreaterR( &ADistIsLessCR, Zero, ADist );
+
+    uint32_t ADistIsGreaterCR;
+    XMVECTOR ADistIsGreater = XMVectorGreaterR( &ADistIsGreaterCR, ADist, Zero );
+
+    // If all the points are on the same side we don't intersect.
+    if( XMComparisonAllTrue( ADistIsLessCR ) || XMComparisonAllTrue( ADistIsGreaterCR ) )
+        return false;
+
+    // Special case for co-planar triangles.
+    if( XMComparisonAllTrue( ADistIsZeroCR ) || XMComparisonAllTrue( BDistIsZeroCR ) )
+    {
+        XMVECTOR Axis, Dist, MinDist;
+
+        // Compute an axis perpendicular to the edge (points out).
+        Axis = XMVector3Cross( N1, A1 - A0 );
+        Dist = XMVector3Dot( Axis, A0 );
+
+        // Test points of B against the axis. 
+ MinDist = XMVector3Dot( B0, Axis ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) ); + if( XMVector4GreaterOrEqual( MinDist, Dist ) ) + return false; + + // Edge (A1, A2) + Axis = XMVector3Cross( N1, A2 - A1 ); + Dist = XMVector3Dot( Axis, A1 ); + + MinDist = XMVector3Dot( B0, Axis ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) ); + if( XMVector4GreaterOrEqual( MinDist, Dist ) ) + return false; + + // Edge (A2, A0) + Axis = XMVector3Cross( N1, A0 - A2 ); + Dist = XMVector3Dot( Axis, A2 ); + + MinDist = XMVector3Dot( B0, Axis ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( B1, Axis ) ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( B2, Axis ) ); + if( XMVector4GreaterOrEqual( MinDist, Dist ) ) + return false; + + // Edge (B0, B1) + Axis = XMVector3Cross( N2, B1 - B0 ); + Dist = XMVector3Dot( Axis, B0 ); + + MinDist = XMVector3Dot( A0, Axis ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) ); + if( XMVector4GreaterOrEqual( MinDist, Dist ) ) + return false; + + // Edge (B1, B2) + Axis = XMVector3Cross( N2, B2 - B1 ); + Dist = XMVector3Dot( Axis, B1 ); + + MinDist = XMVector3Dot( A0, Axis ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) ); + if( XMVector4GreaterOrEqual( MinDist, Dist ) ) + return false; + + // Edge (B2,B0) + Axis = XMVector3Cross( N2, B0 - B2 ); + Dist = XMVector3Dot( Axis, B2 ); + + MinDist = XMVector3Dot( A0, Axis ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( A1, Axis ) ); + MinDist = XMVectorMin( MinDist, XMVector3Dot( A2, Axis ) ); + if( XMVector4GreaterOrEqual( MinDist, Dist ) ) + return false; + + return true; + } + + // + // Find the single vertex of A and B (ie the vertex on the opposite side + // of the plane from the other two) and 
reorder the edges so we can compute + // the signed edge/edge distances. + // + // if ( (V0 >= 0 && V1 < 0 && V2 < 0) || + // (V0 > 0 && V1 <= 0 && V2 <= 0) || + // (V0 <= 0 && V1 > 0 && V2 > 0) || + // (V0 < 0 && V1 >= 0 && V2 >= 0) ) then V0 is singular; + // + // If our singular vertex is not on the positive side of the plane we reverse + // the triangle winding so that the overlap comparisons will compare the + // correct edges with the correct signs. + // + XMVECTOR ADistIsLessEqual = XMVectorOrInt( ADistIsLess, ADistIsZero ); + XMVECTOR ADistIsGreaterEqual = XMVectorOrInt( ADistIsGreater, ADistIsZero ); + + XMVECTOR AA0, AA1, AA2; + bool bPositiveA; + + if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select0111 ) ) ) + { + // A0 is singular, crossing from positive to negative. + AA0 = A0; AA1 = A1; AA2 = A2; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select0111 ) ) ) + { + // A0 is singular, crossing from negative to positive. + AA0 = A0; AA1 = A2; AA2 = A1; + bPositiveA = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1011 ) ) ) + { + // A1 is singular, crossing from positive to negative. + AA0 = A1; AA1 = A2; AA2 = A0; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1011 ) ) ) + { + // A1 is singular, crossing from negative to positive. 
+ AA0 = A1; AA1 = A0; AA2 = A2; + bPositiveA = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreaterEqual, ADistIsLess, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsGreater, ADistIsLessEqual, Select1101 ) ) ) + { + // A2 is singular, crossing from positive to negative. + AA0 = A2; AA1 = A0; AA2 = A1; + bPositiveA = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLessEqual, ADistIsGreater, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( ADistIsLess, ADistIsGreaterEqual, Select1101 ) ) ) + { + // A2 is singular, crossing from negative to positive. + AA0 = A2; AA1 = A1; AA2 = A0; + bPositiveA = false; + } + else + { + assert( false ); + return false; + } + + XMVECTOR BDistIsLessEqual = XMVectorOrInt( BDistIsLess, BDistIsZero ); + XMVECTOR BDistIsGreaterEqual = XMVectorOrInt( BDistIsGreater, BDistIsZero ); + + XMVECTOR BB0, BB1, BB2; + bool bPositiveB; + + if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select0111 ) ) ) + { + // B0 is singular, crossing from positive to negative. + BB0 = B0; BB1 = B1; BB2 = B2; + bPositiveB = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select0111 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select0111 ) ) ) + { + // B0 is singular, crossing from negative to positive. + BB0 = B0; BB1 = B2; BB2 = B1; + bPositiveB = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1011 ) ) ) + { + // B1 is singular, crossing from positive to negative. 
+ BB0 = B1; BB1 = B2; BB2 = B0; + bPositiveB = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1011 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1011 ) ) ) + { + // B1 is singular, crossing from negative to positive. + BB0 = B1; BB1 = B0; BB2 = B2; + bPositiveB = false; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreaterEqual, BDistIsLess, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsGreater, BDistIsLessEqual, Select1101 ) ) ) + { + // B2 is singular, crossing from positive to negative. + BB0 = B2; BB1 = B0; BB2 = B1; + bPositiveB = true; + } + else if( DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLessEqual, BDistIsGreater, Select1101 ) ) || + DirectX::Internal::XMVector3AllTrue( XMVectorSelect( BDistIsLess, BDistIsGreaterEqual, Select1101 ) ) ) + { + // B2 is singular, crossing from negative to positive. + BB0 = B2; BB1 = B1; BB2 = B0; + bPositiveB = false; + } + else + { + assert( false ); + return false; + } + + XMVECTOR Delta0, Delta1; + + // Reverse the direction of the test depending on whether the singular vertices are + // the same sign or different signs. + if( bPositiveA ^ bPositiveB ) + { + Delta0 = ( BB0 - AA0 ); + Delta1 = ( AA0 - BB0 ); + } + else + { + Delta0 = ( AA0 - BB0 ); + Delta1 = ( BB0 - AA0 ); + } + + // Check if the triangles overlap on the line of intersection between the + // planes of the two triangles by finding the signed line distances. 
+    XMVECTOR Dist0 = XMVector3Dot( Delta0, XMVector3Cross( ( BB2 - BB0 ), ( AA2 - AA0 ) ) );
+    if( XMVector4Greater( Dist0, Zero ) )
+        return false;
+
+    XMVECTOR Dist1 = XMVector3Dot( Delta1, XMVector3Cross( ( BB1 - BB0 ), ( AA1 - AA0 ) ) );
+    if( XMVector4Greater( Dist1, Zero ) )
+        return false;
+
+    return true;
+}
+
+
+//-----------------------------------------------------------------------------
+// Plane-triangle test
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline PlaneIntersectionType Intersects( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2, GXMVECTOR Plane )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    assert( DirectX::Internal::XMPlaneIsUnit( Plane ) );
+
+    // Set w of the points to one so we can dot4 with a plane.
+    XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One);
+    XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One);
+    XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One);
+
+    XMVECTOR Outside, Inside;
+    DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane, Outside, Inside );
+
+    // If the triangle is outside any plane it is outside.
+    if ( XMVector4EqualInt( Outside, XMVectorTrueInt() ) )
+        return FRONT;
+
+    // If the triangle is inside all planes it is inside.
+    if ( XMVector4EqualInt( Inside, XMVectorTrueInt() ) )
+        return BACK;
+
+    // The triangle is not inside all planes or outside a plane, so it intersects.
+    return INTERSECTING;
+}
+
+
+//-----------------------------------------------------------------------------
+// Test a triangle vs 6 planes (typically forming a frustum).
+//-----------------------------------------------------------------------------
+_Use_decl_annotations_
+inline ContainmentType ContainedBy( FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR V2,
+                                    GXMVECTOR Plane0, CXMVECTOR Plane1, CXMVECTOR Plane2,
+                                    CXMVECTOR Plane3, CXMVECTOR Plane4, CXMVECTOR Plane5 )
+{
+    XMVECTOR One = XMVectorSplatOne();
+
+    // Set w of the points to one so we can dot4 with a plane. 
+ XMVECTOR TV0 = XMVectorInsert<0, 0, 0, 0, 1>(V0, One); + XMVECTOR TV1 = XMVectorInsert<0, 0, 0, 0, 1>(V1, One); + XMVECTOR TV2 = XMVectorInsert<0, 0, 0, 0, 1>(V2, One); + + XMVECTOR Outside, Inside; + + // Test against each plane. + DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane0, Outside, Inside ); + + XMVECTOR AnyOutside = Outside; + XMVECTOR AllInside = Inside; + + DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane1, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane2, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane3, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane4, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + DirectX::Internal::FastIntersectTrianglePlane( TV0, TV1, TV2, Plane5, Outside, Inside ); + AnyOutside = XMVectorOrInt( AnyOutside, Outside ); + AllInside = XMVectorAndInt( AllInside, Inside ); + + // If the triangle is outside any plane it is outside. + if ( XMVector4EqualInt( AnyOutside, XMVectorTrueInt() ) ) + return DISJOINT; + + // If the triangle is inside all planes it is inside. + if ( XMVector4EqualInt( AllInside, XMVectorTrueInt() ) ) + return CONTAINS; + + // The triangle is not inside all planes or outside a plane, it may intersect. 
+ return INTERSECTS; +} + +}; // namespace TriangleTests + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXColors.h b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXColors.h new file mode 100644 index 00000000..b728302c --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXColors.h @@ -0,0 +1,168 @@ +//------------------------------------------------------------------------------------- +// DirectXColors.h -- C++ Color Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#include "DirectXMath.h" + +namespace DirectX +{ + +namespace Colors +{ + // Standard colors (Red/Green/Blue/Alpha) + XMGLOBALCONST XMVECTORF32 AliceBlue = {0.941176534f, 0.972549081f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 AntiqueWhite = {0.980392218f, 0.921568692f, 0.843137324f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Aqua = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Aquamarine = {0.498039246f, 1.000000000f, 0.831372619f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Azure = {0.941176534f, 1.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Beige = {0.960784376f, 0.960784376f, 0.862745166f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Bisque = {1.000000000f, 0.894117713f, 0.768627524f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Black = {0.000000000f, 0.000000000f, 0.000000000f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 BlanchedAlmond = {1.000000000f, 0.921568692f, 0.803921640f, 1.000000000f}; + XMGLOBALCONST XMVECTORF32 Blue = {0.000000000f, 0.000000000f, 1.000000000f, 1.000000000f}; + XMGLOBALCONST 
XMVECTORF32 BlueViolet = {0.541176498f, 0.168627456f, 0.886274576f, 1.000000000f};
+    // Each channel is an 8-bit sRGB component divided by 255 (e.g. BlueViolet red = 138/255).
+    XMGLOBALCONST XMVECTORF32 Brown = {0.647058845f, 0.164705887f, 0.164705887f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 BurlyWood = {0.870588303f, 0.721568644f, 0.529411793f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 CadetBlue = {0.372549027f, 0.619607866f, 0.627451003f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Chartreuse = {0.498039246f, 1.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Chocolate = {0.823529482f, 0.411764741f, 0.117647067f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Coral = {1.000000000f, 0.498039246f, 0.313725501f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 CornflowerBlue = {0.392156899f, 0.584313750f, 0.929411829f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Cornsilk = {1.000000000f, 0.972549081f, 0.862745166f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Crimson = {0.862745166f, 0.078431375f, 0.235294133f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Cyan = {0.000000000f, 1.000000000f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkBlue = {0.000000000f, 0.000000000f, 0.545098066f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkCyan = {0.000000000f, 0.545098066f, 0.545098066f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkGoldenrod = {0.721568644f, 0.525490224f, 0.043137256f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkGray = {0.662745118f, 0.662745118f, 0.662745118f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkGreen = {0.000000000f, 0.392156899f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkKhaki = {0.741176486f, 0.717647076f, 0.419607878f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkMagenta = {0.545098066f, 0.000000000f, 0.545098066f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkOliveGreen = {0.333333343f, 0.419607878f, 0.184313729f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkOrange = {1.000000000f, 0.549019635f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkOrchid = {0.600000024f, 0.196078449f, 0.800000072f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkRed = {0.545098066f, 0.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkSalmon = {0.913725555f, 0.588235319f, 0.478431404f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkSeaGreen = {0.560784340f, 0.737254918f, 0.545098066f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkSlateBlue = {0.282352954f, 0.239215702f, 0.545098066f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkSlateGray = {0.184313729f, 0.309803933f, 0.309803933f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkTurquoise = {0.000000000f, 0.807843208f, 0.819607913f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DarkViolet = {0.580392182f, 0.000000000f, 0.827451050f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DeepPink = {1.000000000f, 0.078431375f, 0.576470613f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DeepSkyBlue = {0.000000000f, 0.749019623f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DimGray = {0.411764741f, 0.411764741f, 0.411764741f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 DodgerBlue = {0.117647067f, 0.564705908f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Firebrick = {0.698039234f, 0.133333340f, 0.133333340f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 FloralWhite = {1.000000000f, 0.980392218f, 0.941176534f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 ForestGreen = {0.133333340f, 0.545098066f, 0.133333340f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Fuchsia = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Gainsboro = {0.862745166f, 0.862745166f, 0.862745166f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 GhostWhite = {0.972549081f, 0.972549081f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Gold = {1.000000000f, 0.843137324f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Goldenrod = {0.854902029f, 0.647058845f, 0.125490203f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Gray = {0.501960814f, 0.501960814f, 0.501960814f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Green = {0.000000000f, 0.501960814f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 GreenYellow = {0.678431392f, 1.000000000f, 0.184313729f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Honeydew = {0.941176534f, 1.000000000f, 0.941176534f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 HotPink = {1.000000000f, 0.411764741f, 0.705882370f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 IndianRed = {0.803921640f, 0.360784322f, 0.360784322f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Indigo = {0.294117659f, 0.000000000f, 0.509803951f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Ivory = {1.000000000f, 1.000000000f, 0.941176534f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Khaki = {0.941176534f, 0.901960850f, 0.549019635f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Lavender = {0.901960850f, 0.901960850f, 0.980392218f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LavenderBlush = {1.000000000f, 0.941176534f, 0.960784376f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LawnGreen = {0.486274540f, 0.988235354f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LemonChiffon = {1.000000000f, 0.980392218f, 0.803921640f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightBlue = {0.678431392f, 0.847058892f, 0.901960850f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightCoral = {0.941176534f, 0.501960814f, 0.501960814f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightCyan = {0.878431439f, 1.000000000f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightGoldenrodYellow = {0.980392218f, 0.980392218f, 0.823529482f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightGreen = {0.564705908f, 0.933333397f, 0.564705908f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightGray = {0.827451050f, 0.827451050f, 0.827451050f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightPink = {1.000000000f, 0.713725507f, 0.756862819f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightSalmon = {1.000000000f, 0.627451003f, 0.478431404f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightSeaGreen = {0.125490203f, 0.698039234f, 0.666666687f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightSkyBlue = {0.529411793f, 0.807843208f, 0.980392218f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightSlateGray = {0.466666698f, 0.533333361f, 0.600000024f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightSteelBlue = {0.690196097f, 0.768627524f, 0.870588303f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LightYellow = {1.000000000f, 1.000000000f, 0.878431439f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Lime = {0.000000000f, 1.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 LimeGreen = {0.196078449f, 0.803921640f, 0.196078449f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Linen = {0.980392218f, 0.941176534f, 0.901960850f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Magenta = {1.000000000f, 0.000000000f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Maroon = {0.501960814f, 0.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumAquamarine = {0.400000036f, 0.803921640f, 0.666666687f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumBlue = {0.000000000f, 0.000000000f, 0.803921640f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumOrchid = {0.729411781f, 0.333333343f, 0.827451050f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumPurple = {0.576470613f, 0.439215720f, 0.858823597f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumSeaGreen = {0.235294133f, 0.701960802f, 0.443137288f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumSlateBlue = {0.482352972f, 0.407843173f, 0.933333397f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumSpringGreen = {0.000000000f, 0.980392218f, 0.603921592f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumTurquoise = {0.282352954f, 0.819607913f, 0.800000072f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MediumVioletRed = {0.780392230f, 0.082352944f, 0.521568656f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MidnightBlue = {0.098039225f, 0.098039225f, 0.439215720f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MintCream = {0.960784376f, 1.000000000f, 0.980392218f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 MistyRose = {1.000000000f, 0.894117713f, 0.882353008f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Moccasin = {1.000000000f, 0.894117713f, 0.709803939f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 NavajoWhite = {1.000000000f, 0.870588303f, 0.678431392f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Navy = {0.000000000f, 0.000000000f, 0.501960814f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 OldLace = {0.992156923f, 0.960784376f, 0.901960850f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Olive = {0.501960814f, 0.501960814f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 OliveDrab = {0.419607878f, 0.556862772f, 0.137254909f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Orange = {1.000000000f, 0.647058845f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 OrangeRed = {1.000000000f, 0.270588249f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Orchid = {0.854902029f, 0.439215720f, 0.839215755f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 PaleGoldenrod = {0.933333397f, 0.909803987f, 0.666666687f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 PaleGreen = {0.596078455f, 0.984313786f, 0.596078455f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 PaleTurquoise = {0.686274529f, 0.933333397f, 0.933333397f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 PaleVioletRed = {0.858823597f, 0.439215720f, 0.576470613f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 PapayaWhip = {1.000000000f, 0.937254965f, 0.835294187f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 PeachPuff = {1.000000000f, 0.854902029f, 0.725490212f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Peru = {0.803921640f, 0.521568656f, 0.247058839f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Pink = {1.000000000f, 0.752941251f, 0.796078503f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Plum = {0.866666734f, 0.627451003f, 0.866666734f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 PowderBlue = {0.690196097f, 0.878431439f, 0.901960850f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Purple = {0.501960814f, 0.000000000f, 0.501960814f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Red = {1.000000000f, 0.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 RosyBrown = {0.737254918f, 0.560784340f, 0.560784340f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 RoyalBlue = {0.254901975f, 0.411764741f, 0.882353008f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SaddleBrown = {0.545098066f, 0.270588249f, 0.074509807f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Salmon = {0.980392218f, 0.501960814f, 0.447058856f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SandyBrown = {0.956862807f, 0.643137276f, 0.376470625f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SeaGreen = {0.180392161f, 0.545098066f, 0.341176480f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SeaShell = {1.000000000f, 0.960784376f, 0.933333397f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Sienna = {0.627451003f, 0.321568638f, 0.176470593f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Silver = {0.752941251f, 0.752941251f, 0.752941251f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SkyBlue = {0.529411793f, 0.807843208f, 0.921568692f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SlateBlue = {0.415686309f, 0.352941185f, 0.803921640f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SlateGray = {0.439215720f, 0.501960814f, 0.564705908f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Snow = {1.000000000f, 0.980392218f, 0.980392218f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SpringGreen = {0.000000000f, 1.000000000f, 0.498039246f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 SteelBlue = {0.274509817f, 0.509803951f, 0.705882370f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Tan = {0.823529482f, 0.705882370f, 0.549019635f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Teal = {0.000000000f, 0.501960814f, 0.501960814f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Thistle = {0.847058892f, 0.749019623f, 0.847058892f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Tomato = {1.000000000f, 0.388235331f, 0.278431386f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Transparent = {0.000000000f, 0.000000000f, 0.000000000f, 0.000000000f};
+    XMGLOBALCONST XMVECTORF32 Turquoise = {0.250980407f, 0.878431439f, 0.815686345f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Violet = {0.933333397f, 0.509803951f, 0.933333397f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Wheat = {0.960784376f, 0.870588303f, 0.701960802f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 White = {1.000000000f, 1.000000000f, 1.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 WhiteSmoke = {0.960784376f, 0.960784376f, 0.960784376f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 Yellow = {1.000000000f, 1.000000000f, 0.000000000f, 1.000000000f};
+    XMGLOBALCONST XMVECTORF32 YellowGreen = {0.603921592f, 0.803921640f, 0.196078449f, 1.000000000f};
+
+}; // namespace Colors
+
+}; // namespace DirectX
diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMath.h b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMath.h
new file mode 100644
index 00000000..c79ef233
--- /dev/null
+++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMath.h
@@ -0,0 +1,1861 @@
+//-------------------------------------------------------------------------------------
+// DirectXMath.h -- SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//------------------------------------------------------------------------------------- + + +// MGH ------------------- +#define _XM_BIGENDIAN_ +#define _XM_NO_INTRINSICS_ +// ----------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#ifndef __cplusplus +#error DirectX Math requires C++ +#endif + +#define DIRECTX_MATH_VERSION 303 + +#if !defined(_XM_BIGENDIAN_) && !defined(_XM_LITTLEENDIAN_) +#if defined(_M_AMD64) || defined(_M_IX86) || defined(_M_ARM) +#define _XM_LITTLEENDIAN_ +#elif defined(_M_PPCBE) +#define _XM_BIGENDIAN_ +#else +#error DirectX Math does not support this target +#endif +#endif // !_XM_BIGENDIAN_ && !_XM_LITTLEENDIAN_ + + + +#if !defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_SSE_INTRINSICS_) && !defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#if defined(_M_IX86) || defined(_M_AMD64) +#define _XM_SSE_INTRINSICS_ +#elif defined(_M_PPCBE) +#define _XM_VMX128_INTRINSICS_ +#elif defined(_M_ARM) +#define _XM_ARM_NEON_INTRINSICS_ +#elif !defined(_XM_NO_INTRINSICS_) +#error DirectX Math does not support this target +#endif +#endif // !_XM_ARM_NEON_INTRINSICS_ && !_XM_SSE_INTRINSICS_ && !_XM_VMX128_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +#pragma warning(push) +#pragma warning(disable:4514 4820 4985) +#include <cmath> +#include <float.h> +// MGH - #include <malloc.h> +#pragma warning(pop) + + +#if defined(_XM_SSE_INTRINSICS_) +#ifndef _XM_NO_INTRINSICS_ +#include <xmmintrin.h> +#include <emmintrin.h> +#endif +#elif defined(_XM_VMX128_INTRINSICS_) +#error This version of DirectX Math does not support Xbox 360 +#elif defined(_XM_ARM_NEON_INTRINSICS_) +#ifndef _XM_NO_INTRINSICS_ +#include <arm_neon.h> +#endif +#endif + + + +#include <DirectX/no_sal2.h> +#include <assert.h> + + +#pragma warning(push) +#pragma warning(disable : 4005 4668) +#include <stdint.h> +#pragma warning(pop) + + +namespace DirectX +{ + +/**************************************************************************** + * + * Constant definitions + * + 
****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XM_PI) +#undef XM_PI +#undef XM_2PI +#undef XM_1DIVPI +#undef XM_1DIV2PI +#undef XM_PIDIV2 +#undef XM_PIDIV4 +#undef XM_SELECT_0 +#undef XM_SELECT_1 +#undef XM_PERMUTE_0X +#undef XM_PERMUTE_0Y +#undef XM_PERMUTE_0Z +#undef XM_PERMUTE_0W +#undef XM_PERMUTE_1X +#undef XM_PERMUTE_1Y +#undef XM_PERMUTE_1Z +#undef XM_PERMUTE_1W +#undef XM_CRMASK_CR6 +#undef XM_CRMASK_CR6TRUE +#undef XM_CRMASK_CR6FALSE +#undef XM_CRMASK_CR6BOUNDS +#undef XM_CACHE_LINE_SIZE +#endif + +const float XM_PI = 3.141592654f; +const float XM_2PI = 6.283185307f; +const float XM_1DIVPI = 0.318309886f; +const float XM_1DIV2PI = 0.159154943f; +const float XM_PIDIV2 = 1.570796327f; +const float XM_PIDIV4 = 0.785398163f; + +const uint32_t XM_SELECT_0 = 0x00000000; +const uint32_t XM_SELECT_1 = 0xFFFFFFFF; + +const uint32_t XM_PERMUTE_0X = 0; +const uint32_t XM_PERMUTE_0Y = 1; +const uint32_t XM_PERMUTE_0Z = 2; +const uint32_t XM_PERMUTE_0W = 3; +const uint32_t XM_PERMUTE_1X = 4; +const uint32_t XM_PERMUTE_1Y = 5; +const uint32_t XM_PERMUTE_1Z = 6; +const uint32_t XM_PERMUTE_1W = 7; + +const uint32_t XM_SWIZZLE_X = 0; +const uint32_t XM_SWIZZLE_Y = 1; +const uint32_t XM_SWIZZLE_Z = 2; +const uint32_t XM_SWIZZLE_W = 3; + +const uint32_t XM_CRMASK_CR6 = 0x000000F0; +const uint32_t XM_CRMASK_CR6TRUE = 0x00000080; +const uint32_t XM_CRMASK_CR6FALSE = 0x00000020; +const uint32_t XM_CRMASK_CR6BOUNDS = XM_CRMASK_CR6FALSE; + + + +/**************************************************************************** + * + * Macros + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMComparisonAllTrue) +#undef XMComparisonAllTrue +#undef XMComparisonAnyTrue +#undef XMComparisonAllFalse +#undef XMComparisonAnyFalse +#undef XMComparisonMixed +#undef XMComparisonAllInBounds +#undef XMComparisonAnyOutOfBounds +#endif + +// Unit 
conversion + +inline float XMConvertToRadians(float fDegrees) { return fDegrees * (XM_PI / 180.0f); } +inline float XMConvertToDegrees(float fRadians) { return fRadians * (180.0f / XM_PI); } + +// Condition register evaluation proceeding a recording (R) comparison + +inline bool XMComparisonAllTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) == XM_CRMASK_CR6TRUE); } +inline bool XMComparisonAnyTrue(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) != XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAllFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6FALSE) == XM_CRMASK_CR6FALSE); } +inline bool XMComparisonAnyFalse(uint32_t CR) { return (((CR) & XM_CRMASK_CR6TRUE) != XM_CRMASK_CR6TRUE); } +inline bool XMComparisonMixed(uint32_t CR) { return (((CR) & XM_CRMASK_CR6) == 0); } +inline bool XMComparisonAllInBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) == XM_CRMASK_CR6BOUNDS); } +inline bool XMComparisonAnyOutOfBounds(uint32_t CR) { return (((CR) & XM_CRMASK_CR6BOUNDS) != XM_CRMASK_CR6BOUNDS); } + + +/**************************************************************************** + * + * Data types + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4201 4365 4324 4820) + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#ifdef _XM_BIGENDIAN_ +#pragma bitfield_order(push) +#pragma bitfield_order(lsb_to_msb) +#endif + +//------------------------------------------------------------------------------ +#if defined(_XM_NO_INTRINSICS_) && !defined(_M_PPCBE) +// The __vector4 structure is an intrinsic on Xbox but must be separately defined +// for x86/x64 +struct __vector4 +{ + union + { + float vector4_f32[4]; + uint32_t vector4_u32[4]; + // MGH - added to match 360 version +//---------------------- +struct + { + float x; + float y; + float z; + float w; + }; + float v[4]; + uint32_t u[4]; +//---------------------- + + }; +}; +#endif // 
_XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ +#if (defined (_M_IX86) || defined(_M_AMD64) || defined(_M_ARM)) && defined(_XM_NO_INTRINSICS_) +typedef uint32_t __vector4i[4]; +#else +typedef __declspec(align(16)) uint32_t __vector4i[4]; +#endif + +//------------------------------------------------------------------------------ +// Vector intrinsic: Four 32 bit floating point components aligned on a 16 byte +// boundary and mapped to hardware vector registers +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef __m128 XMVECTOR; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef __n128 XMVECTOR; +#else +typedef __vector4 XMVECTOR; +#endif + +// Fix-up for (1st-3rd) XMVECTOR parameters that are pass-in-register for x86, ARM, and Xbox 360; by reference otherwise +#if ( defined(_M_IX86) || defined(_M_ARM) || defined(_XM_VMX128_INTRINSICS_) ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR FXMVECTOR; +#else +typedef const XMVECTOR& FXMVECTOR; +#endif + +// Fix-up for (4th) XMVECTOR parameter to pass in-register for ARM and Xbox 360; by reference otherwise +#if ( defined(_M_ARM) || defined(_XM_VMX128_INTRINSICS_) ) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR GXMVECTOR; +#else +typedef const XMVECTOR& GXMVECTOR; +#endif + +// Fix-up for (5th+) XMVECTOR parameters to pass in-register for Xbox 360 and by reference otherwise +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef const XMVECTOR CXMVECTOR; +#else +typedef const XMVECTOR& CXMVECTOR; +#endif + +//------------------------------------------------------------------------------ +// Conversion types for constants +__declspec(align(16)) struct XMVECTORF32 +{ + union + { + float f[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } + inline operator const float*() const { return f; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) 
+ inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORI32 +{ + union + { + int32_t i[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORU8 +{ + union + { + uint8_t u[16]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +__declspec(align(16)) struct XMVECTORU32 +{ + union + { + uint32_t u[4]; + XMVECTOR v; + }; + + inline operator XMVECTOR() const { return v; } +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_SSE_INTRINSICS_) + inline operator __m128i() const { return _mm_castps_si128(v); } + inline operator __m128d() const { return _mm_castps_pd(v); } +#endif +}; + +//------------------------------------------------------------------------------ +// Vector operators +XMVECTOR operator+ (FXMVECTOR V); +XMVECTOR operator- (FXMVECTOR V); + +XMVECTOR& operator+= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator-= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator*= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator/= (XMVECTOR& V1, FXMVECTOR V2); +XMVECTOR& operator*= (XMVECTOR& V, float S); +XMVECTOR& operator/= (XMVECTOR& V, float S); + +XMVECTOR operator+ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator- (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator* (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator/ (FXMVECTOR V1, FXMVECTOR V2); +XMVECTOR operator* (FXMVECTOR V, float S); +XMVECTOR operator* (float S, FXMVECTOR V); +XMVECTOR operator/ (FXMVECTOR V, 
float S); + +//------------------------------------------------------------------------------ +// Matrix type: Sixteen 32 bit floating point components aligned on a +// 16 byte boundary and mapped to four hardware vector registers + +struct XMMATRIX; + +// Fix-up for XMMATRIX parameters to pass in-register on Xbox 360, by reference otherwise +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +typedef const XMMATRIX CXMMATRIX; +#else +typedef const XMMATRIX& CXMMATRIX; +#endif + +#if (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_ARM)) && defined(_XM_NO_INTRINSICS_) +struct XMMATRIX +#else +__declspec(align(16)) struct XMMATRIX +#endif +{ +#ifdef _XM_NO_INTRINSICS_ + union + { + XMVECTOR r[4]; + struct + { + float _11, _12, _13, _14; + float _21, _22, _23, _24; + float _31, _32, _33, _34; + float _41, _42, _43, _44; + }; + float m[4][4]; + }; +#else + XMVECTOR r[4]; +#endif + + XMMATRIX() {} + XMMATRIX(FXMVECTOR R0, FXMVECTOR R1, FXMVECTOR R2, GXMVECTOR R3) { r[0] = R0; r[1] = R1; r[2] = R2; r[3] = R3; } + XMMATRIX(float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33); + explicit XMMATRIX(_In_reads_(16) const float *pArray); + +#ifdef _XM_NO_INTRINSICS_ + float operator() (size_t Row, size_t Column) const { return m[Row][Column]; } + float& operator() (size_t Row, size_t Column) { return m[Row][Column]; } +#endif + + XMMATRIX& operator= (const XMMATRIX& M) { r[0] = M.r[0]; r[1] = M.r[1]; r[2] = M.r[2]; r[3] = M.r[3]; return *this; } + + XMMATRIX operator+ () const { return *this; } + XMMATRIX operator- () const; + + XMMATRIX& operator+= (CXMMATRIX M); + XMMATRIX& operator-= (CXMMATRIX M); + XMMATRIX& operator*= (CXMMATRIX M); + XMMATRIX& operator*= (float S); + XMMATRIX& operator/= (float S); + + XMMATRIX operator+ (CXMMATRIX M) const; + XMMATRIX operator- (CXMMATRIX M) const; + XMMATRIX operator* (CXMMATRIX M) 
const; + XMMATRIX operator* (float S) const; + XMMATRIX operator/ (float S) const; + + friend XMMATRIX operator* (float S, CXMMATRIX M); +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit floating point components +struct XMFLOAT2 +{ + float x; + float y; + + XMFLOAT2() {} + XMFLOAT2(float _x, float _y) : x(_x), y(_y) {} + explicit XMFLOAT2(_In_reads_(2) const float *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMFLOAT2& operator= (const XMFLOAT2& Float2) { x = Float2.x; y = Float2.y; return *this; } +}; + +// 2D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT2A : public XMFLOAT2 +{ + XMFLOAT2A() : XMFLOAT2() {} + XMFLOAT2A(float _x, float _y) : XMFLOAT2(_x, _y) {} + explicit XMFLOAT2A(_In_reads_(2) const float *pArray) : XMFLOAT2(pArray) {} + + XMFLOAT2A& operator= (const XMFLOAT2A& Float2) { x = Float2.x; y = Float2.y; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 32 bit signed integer components +struct XMINT2 +{ + int32_t x; + int32_t y; + + XMINT2() {} + XMINT2(int32_t _x, int32_t _y) : x(_x), y(_y) {} + explicit XMINT2(_In_reads_(2) const int32_t *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMINT2& operator= (const XMINT2& Int2) { x = Int2.x; y = Int2.y; return *this; } +}; + +// 2D Vector; 32 bit unsigned integer components +struct XMUINT2 +{ + uint32_t x; + uint32_t y; + + XMUINT2() {} + XMUINT2(uint32_t _x, uint32_t _y) : x(_x), y(_y) {} + explicit XMUINT2(_In_reads_(2) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]) {} + + XMUINT2& operator= (const XMUINT2& UInt2) { x = UInt2.x; y = UInt2.y; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit floating point components +struct XMFLOAT3 +{ + float x; + float y; + float z; + + XMFLOAT3() {} + XMFLOAT3(float _x, float _y, float _z) : x(_x), 
y(_y), z(_z) {} + explicit XMFLOAT3(_In_reads_(3) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMFLOAT3& operator= (const XMFLOAT3& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } +}; + +// 3D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT3A : public XMFLOAT3 +{ + XMFLOAT3A() : XMFLOAT3() {} + XMFLOAT3A(float _x, float _y, float _z) : XMFLOAT3(_x, _y, _z) {} + explicit XMFLOAT3A(_In_reads_(3) const float *pArray) : XMFLOAT3(pArray) {} + + XMFLOAT3A& operator= (const XMFLOAT3A& Float3) { x = Float3.x; y = Float3.y; z = Float3.z; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D Vector; 32 bit signed integer components +struct XMINT3 +{ + int32_t x; + int32_t y; + int32_t z; + + XMINT3() {} + XMINT3(int32_t _x, int32_t _y, int32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMINT3(_In_reads_(3) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMINT3& operator= (const XMINT3& i3) { x = i3.x; y = i3.y; z = i3.z; return *this; } +}; + +// 3D Vector; 32 bit unsigned integer components +struct XMUINT3 +{ + uint32_t x; + uint32_t y; + uint32_t z; + + XMUINT3() {} + XMUINT3(uint32_t _x, uint32_t _y, uint32_t _z) : x(_x), y(_y), z(_z) {} + explicit XMUINT3(_In_reads_(3) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + + XMUINT3& operator= (const XMUINT3& u3) { x = u3.x; y = u3.y; z = u3.z; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit floating point components +struct XMFLOAT4 +{ + float x; + float y; + float z; + float w; + + XMFLOAT4() {} + XMFLOAT4(float _x, float _y, float _z, float _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMFLOAT4(_In_reads_(4) const float *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMFLOAT4& operator= (const XMFLOAT4& Float4) { x = 
Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } +}; + +// 4D Vector; 32 bit floating point components aligned on a 16 byte boundary +__declspec(align(16)) struct XMFLOAT4A : public XMFLOAT4 +{ + XMFLOAT4A() : XMFLOAT4() {} + XMFLOAT4A(float _x, float _y, float _z, float _w) : XMFLOAT4(_x, _y, _z, _w) {} + explicit XMFLOAT4A(_In_reads_(4) const float *pArray) : XMFLOAT4(pArray) {} + + XMFLOAT4A& operator= (const XMFLOAT4A& Float4) { x = Float4.x; y = Float4.y; z = Float4.z; w = Float4.w; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 32 bit signed integer components +struct XMINT4 +{ + int32_t x; + int32_t y; + int32_t z; + int32_t w; + + XMINT4() {} + XMINT4(int32_t _x, int32_t _y, int32_t _z, int32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMINT4(_In_reads_(4) const int32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMINT4& operator= (const XMINT4& Int4) { x = Int4.x; y = Int4.y; z = Int4.z; w = Int4.w; return *this; } +}; + +// 4D Vector; 32 bit unsigned integer components +struct XMUINT4 +{ + uint32_t x; + uint32_t y; + uint32_t z; + uint32_t w; + + XMUINT4() {} + XMUINT4(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUINT4(_In_reads_(4) const uint32_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + + XMUINT4& operator= (const XMUINT4& UInt4) { x = UInt4.x; y = UInt4.y; z = UInt4.z; w = UInt4.w; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3x3 Matrix: 32 bit floating point components +struct XMFLOAT3X3 +{ + union + { + struct + { + float _11, _12, _13; + float _21, _22, _23; + float _31, _32, _33; + }; + float m[3][3]; + }; + + XMFLOAT3X3() {} + XMFLOAT3X3(float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22); + explicit XMFLOAT3X3(_In_reads_(9) const 
float *pArray);

    // Row/column accessors (0-based indices).
    float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
    float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }

    XMFLOAT3X3& operator= (const XMFLOAT3X3& Float3x3);
};

//------------------------------------------------------------------------------
// 4x3 Matrix: 32 bit floating point components
// Row-major storage: _RC names and m[row][col] alias the same bytes via the union.
struct XMFLOAT4X3
{
    union
    {
        struct
        {
            float _11, _12, _13;
            float _21, _22, _23;
            float _31, _32, _33;
            float _41, _42, _43;
        };
        float m[4][3];
    };

    // Default constructor deliberately leaves members uninitialized (body is empty).
    XMFLOAT4X3() {}
    // Element-wise constructor; defined out of line (see the matching .inl file).
    XMFLOAT4X3(float m00, float m01, float m02,
               float m10, float m11, float m12,
               float m20, float m21, float m22,
               float m30, float m31, float m32);
    explicit XMFLOAT4X3(_In_reads_(12) const float *pArray);

    // Row/column accessors (0-based indices).
    float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
    float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }

    XMFLOAT4X3& operator= (const XMFLOAT4X3& Float4x3);

};

// 4x3 Matrix: 32 bit floating point components aligned on a 16 byte boundary
__declspec(align(16)) struct XMFLOAT4X3A : public XMFLOAT4X3
{
    XMFLOAT4X3A() : XMFLOAT4X3() {}
    XMFLOAT4X3A(float m00, float m01, float m02,
                float m10, float m11, float m12,
                float m20, float m21, float m22,
                float m30, float m31, float m32) :
        XMFLOAT4X3(m00,m01,m02,m10,m11,m12,m20,m21,m22,m30,m31,m32) {}
    explicit XMFLOAT4X3A(_In_reads_(12) const float *pArray) : XMFLOAT4X3(pArray) {}

    float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
    float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }

    XMFLOAT4X3A& operator= (const XMFLOAT4X3A& Float4x3);
};

//------------------------------------------------------------------------------
// 4x4 Matrix: 32 bit floating point components
// Row-major storage: _RC names and m[row][col] alias the same bytes via the union.
struct XMFLOAT4X4
{
    union
    {
        struct
        {
            float _11, _12, _13, _14;
            float _21, _22, _23, _24;
            float _31, _32, _33, _34;
            float _41, _42, _43, _44;
        };
        float m[4][4];

    };

    // Default constructor deliberately leaves members uninitialized (body is empty).
    XMFLOAT4X4() {}
    // Element-wise constructor; defined out of line (see the matching .inl file).
    XMFLOAT4X4(float m00, float m01, float m02, float m03,
               float m10, float m11, float m12, float m13,
               float m20, float m21, float m22, float m23,
               float m30, float m31, float m32, float m33);
    explicit XMFLOAT4X4(_In_reads_(16) const float *pArray);

    // Row/column accessors (0-based indices).
    float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
    float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }

    XMFLOAT4X4& operator= (const XMFLOAT4X4& Float4x4);
};

// 4x4 Matrix: 32 bit floating point components aligned on a 16 byte boundary
__declspec(align(16)) struct XMFLOAT4X4A : public XMFLOAT4X4
{
    XMFLOAT4X4A() : XMFLOAT4X4() {}
    XMFLOAT4X4A(float m00, float m01, float m02, float m03,
                float m10, float m11, float m12, float m13,
                float m20, float m21, float m22, float m23,
                float m30, float m31, float m32, float m33)
        : XMFLOAT4X4(m00,m01,m02,m03,m10,m11,m12,m13,m20,m21,m22,m23,m30,m31,m32,m33) {}
    explicit XMFLOAT4X4A(_In_reads_(16) const float *pArray) : XMFLOAT4X4(pArray) {}

    float operator() (size_t Row, size_t Column) const { return m[Row][Column]; }
    float& operator() (size_t Row, size_t Column) { return m[Row][Column]; }

    XMFLOAT4X4A& operator= (const XMFLOAT4X4A& Float4x4);
};

////////////////////////////////////////////////////////////////////////////////


#ifdef _XM_BIGENDIAN_
#pragma bitfield_order(pop)
#endif

#pragma prefast(pop)
#pragma warning(pop)

/****************************************************************************
 *
 * Data conversion operations
 *
 ****************************************************************************/

// Fixed-point <-> float conversions; Div/MulExponent scales by 2^exponent.
// Declarations only — implementations live in DirectXMathConvert.inl.
#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_VMX128_INTRINSICS_)
#else
XMVECTOR XMConvertVectorIntToFloat(FXMVECTOR VInt, uint32_t DivExponent);
XMVECTOR XMConvertVectorFloatToInt(FXMVECTOR VFloat, uint32_t MulExponent);
XMVECTOR XMConvertVectorUIntToFloat(FXMVECTOR VUInt, uint32_t DivExponent);
XMVECTOR
XMConvertVectorFloatToUInt(FXMVECTOR VFloat, uint32_t MulExponent); +#endif + +#if !defined(_XM_NO_INTRINSICS_) && defined(_XM_VMX128_INTRINSICS_) +#else + +#if defined(__XNAMATH_H__) && defined(XMVectorSetBinaryConstant) +#undef XMVectorSetBinaryConstant +#undef XMVectorSplatConstant +#undef XMVectorSplatConstantInt +#endif + +XMVECTOR XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3); +XMVECTOR XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent); +XMVECTOR XMVectorSplatConstantInt(int32_t IntConstant); +#endif + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XMLoadInt(_In_ const uint32_t* pSource); +XMVECTOR XMLoadFloat(_In_ const float* pSource); + +XMVECTOR XMLoadInt2(_In_reads_(2) const uint32_t* pSource); +XMVECTOR XMLoadInt2A(_In_reads_(2) const uint32_t* PSource); +XMVECTOR XMLoadFloat2(_In_ const XMFLOAT2* pSource); +XMVECTOR XMLoadFloat2A(_In_ const XMFLOAT2A* pSource); +XMVECTOR XMLoadSInt2(_In_ const XMINT2* pSource); +XMVECTOR XMLoadUInt2(_In_ const XMUINT2* pSource); + +XMVECTOR XMLoadInt3(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XMLoadInt3A(_In_reads_(3) const uint32_t* pSource); +XMVECTOR XMLoadFloat3(_In_ const XMFLOAT3* pSource); +XMVECTOR XMLoadFloat3A(_In_ const XMFLOAT3A* pSource); +XMVECTOR XMLoadSInt3(_In_ const XMINT3* pSource); +XMVECTOR XMLoadUInt3(_In_ const XMUINT3* pSource); + +XMVECTOR XMLoadInt4(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XMLoadInt4A(_In_reads_(4) const uint32_t* pSource); +XMVECTOR XMLoadFloat4(_In_ const XMFLOAT4* pSource); +XMVECTOR XMLoadFloat4A(_In_ const XMFLOAT4A* pSource); +XMVECTOR XMLoadSInt4(_In_ const XMINT4* pSource); +XMVECTOR XMLoadUInt4(_In_ const XMUINT4* pSource); + +XMMATRIX XMLoadFloat3x3(_In_ const XMFLOAT3X3* pSource); +XMMATRIX XMLoadFloat4x3(_In_ const XMFLOAT4X3* pSource); +XMMATRIX 
XMLoadFloat4x3A(_In_ const XMFLOAT4X3A* pSource); +XMMATRIX XMLoadFloat4x4(_In_ const XMFLOAT4X4* pSource); +XMMATRIX XMLoadFloat4x4A(_In_ const XMFLOAT4X4A* pSource); + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +void XMStoreInt(_Out_ uint32_t* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat(_Out_ float* pDestination, _In_ FXMVECTOR V); + +void XMStoreInt2(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XMStoreInt2A(_Out_writes_(2) uint32_t* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat2(_Out_ XMFLOAT2* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat2A(_Out_ XMFLOAT2A* pDestination, _In_ FXMVECTOR V); +void XMStoreSInt2(_Out_ XMINT2* pDestination, _In_ FXMVECTOR V); +void XMStoreUInt2(_Out_ XMUINT2* pDestination, _In_ FXMVECTOR V); + +void XMStoreInt3(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XMStoreInt3A(_Out_writes_(3) uint32_t* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat3(_Out_ XMFLOAT3* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat3A(_Out_ XMFLOAT3A* pDestination, _In_ FXMVECTOR V); +void XMStoreSInt3(_Out_ XMINT3* pDestination, _In_ FXMVECTOR V); +void XMStoreUInt3(_Out_ XMUINT3* pDestination, _In_ FXMVECTOR V); + +void XMStoreInt4(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XMStoreInt4A(_Out_writes_(4) uint32_t* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat4(_Out_ XMFLOAT4* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat4A(_Out_ XMFLOAT4A* pDestination, _In_ FXMVECTOR V); +void XMStoreSInt4(_Out_ XMINT4* pDestination, _In_ FXMVECTOR V); +void XMStoreUInt4(_Out_ XMUINT4* pDestination, _In_ FXMVECTOR V); + +void XMStoreFloat3x3(_Out_ XMFLOAT3X3* pDestination, _In_ CXMMATRIX M); +void XMStoreFloat4x3(_Out_ XMFLOAT4X3* pDestination, _In_ CXMMATRIX M); +void XMStoreFloat4x3A(_Out_ XMFLOAT4X3A* pDestination, _In_ CXMMATRIX 
M);
void XMStoreFloat4x4(_Out_ XMFLOAT4X4* pDestination, _In_ CXMMATRIX M);
void XMStoreFloat4x4A(_Out_ XMFLOAT4X4A* pDestination, _In_ CXMMATRIX M);

/****************************************************************************
 *
 * General vector operations
 *
 ****************************************************************************/

// Constructors / replication helpers. Declarations only — implementations
// live in DirectXMathVector.inl.
XMVECTOR XMVectorZero();
XMVECTOR XMVectorSet(float x, float y, float z, float w);
XMVECTOR XMVectorSetInt(uint32_t x, uint32_t y, uint32_t z, uint32_t w);
XMVECTOR XMVectorReplicate(float Value);
XMVECTOR XMVectorReplicatePtr(_In_ const float *pValue);
XMVECTOR XMVectorReplicateInt(uint32_t Value);
XMVECTOR XMVectorReplicateIntPtr(_In_ const uint32_t *pValue);
XMVECTOR XMVectorTrueInt();
XMVECTOR XMVectorFalseInt();
XMVECTOR XMVectorSplatX(FXMVECTOR V);
XMVECTOR XMVectorSplatY(FXMVECTOR V);
XMVECTOR XMVectorSplatZ(FXMVECTOR V);
XMVECTOR XMVectorSplatW(FXMVECTOR V);
XMVECTOR XMVectorSplatOne();
XMVECTOR XMVectorSplatInfinity();
XMVECTOR XMVectorSplatQNaN();
XMVECTOR XMVectorSplatEpsilon();
XMVECTOR XMVectorSplatSignMask();

// Component read accessors (by value).
float XMVectorGetByIndex(FXMVECTOR V, size_t i);
float XMVectorGetX(FXMVECTOR V);
float XMVectorGetY(FXMVECTOR V);
float XMVectorGetZ(FXMVECTOR V);
float XMVectorGetW(FXMVECTOR V);

// Component read accessors (through an output pointer).
void XMVectorGetByIndexPtr(_Out_ float *f, _In_ FXMVECTOR V, _In_ size_t i);
void XMVectorGetXPtr(_Out_ float *x, _In_ FXMVECTOR V);
void XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V);
void XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V);
void XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V);

// Integer-view component read accessors.
uint32_t XMVectorGetIntByIndex(FXMVECTOR V, size_t i);
uint32_t XMVectorGetIntX(FXMVECTOR V);
uint32_t XMVectorGetIntY(FXMVECTOR V);
uint32_t XMVectorGetIntZ(FXMVECTOR V);
uint32_t XMVectorGetIntW(FXMVECTOR V);

void XMVectorGetIntByIndexPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V, _In_ size_t i);
void XMVectorGetIntXPtr(_Out_ uint32_t *x, _In_ FXMVECTOR V);
void XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V);
void XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V);
void XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V);

// Component write accessors: return a copy of V with one lane replaced.
XMVECTOR XMVectorSetByIndex(FXMVECTOR V,float f, size_t i);
XMVECTOR XMVectorSetX(FXMVECTOR V, float x);
XMVECTOR XMVectorSetY(FXMVECTOR V, float y);
XMVECTOR XMVectorSetZ(FXMVECTOR V, float z);
XMVECTOR XMVectorSetW(FXMVECTOR V, float w);

XMVECTOR XMVectorSetByIndexPtr(_In_ FXMVECTOR V, _In_ const float *f, _In_ size_t i);
XMVECTOR XMVectorSetXPtr(_In_ FXMVECTOR V, _In_ const float *x);
XMVECTOR XMVectorSetYPtr(_In_ FXMVECTOR V, _In_ const float *y);
XMVECTOR XMVectorSetZPtr(_In_ FXMVECTOR V, _In_ const float *z);
XMVECTOR XMVectorSetWPtr(_In_ FXMVECTOR V, _In_ const float *w);

XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i);
XMVECTOR XMVectorSetIntX(FXMVECTOR V, uint32_t x);
XMVECTOR XMVectorSetIntY(FXMVECTOR V, uint32_t y);
XMVECTOR XMVectorSetIntZ(FXMVECTOR V, uint32_t z);
XMVECTOR XMVectorSetIntW(FXMVECTOR V, uint32_t w);

XMVECTOR XMVectorSetIntByIndexPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x, _In_ size_t i);
XMVECTOR XMVectorSetIntXPtr(_In_ FXMVECTOR V, _In_ const uint32_t *x);
XMVECTOR XMVectorSetIntYPtr(_In_ FXMVECTOR V, _In_ const uint32_t *y);
XMVECTOR XMVectorSetIntZPtr(_In_ FXMVECTOR V, _In_ const uint32_t *z);
XMVECTOR XMVectorSetIntWPtr(_In_ FXMVECTOR V, _In_ const uint32_t *w);

// Remove the legacy xnamath.h macro version so the function declaration wins.
#if defined(__XNAMATH_H__) && defined(XMVectorSwizzle)
#undef XMVectorSwizzle
#endif

XMVECTOR XMVectorSwizzle(FXMVECTOR V, uint32_t E0, uint32_t E1, uint32_t E2, uint32_t E3);
XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2, uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW);
XMVECTOR XMVectorSelectControl(uint32_t VectorIndex0, uint32_t VectorIndex1, uint32_t VectorIndex2, uint32_t VectorIndex3);
XMVECTOR XMVectorSelect(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Control);
XMVECTOR XMVectorMergeXY(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR
XMVectorMergeZW(FXMVECTOR V1, FXMVECTOR V2);

// Remove the legacy xnamath.h macro versions so the function declarations below win.
#if defined(__XNAMATH_H__) && defined(XMVectorShiftLeft)
#undef XMVectorShiftLeft
#undef XMVectorRotateLeft
#undef XMVectorRotateRight
#undef XMVectorInsert
#endif

XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements);
XMVECTOR XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements);
XMVECTOR XMVectorRotateRight(FXMVECTOR V, uint32_t Elements);
XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
                        uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3);

// Per-component comparisons; the 'R' variants also report a summary record
// through *pCR.
// NOTE(review): XMVectorEqualIntR names its first vector 'V' rather than 'V1'
// — matches the upstream DirectXMath header, left as-is.
XMVECTOR XMVectorEqual(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
XMVECTOR XMVectorEqualInt(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorEqualIntR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR V2);
XMVECTOR XMVectorNearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
XMVECTOR XMVectorNotEqual(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorNotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorGreater(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorGreaterR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
XMVECTOR XMVectorGreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorGreaterOrEqualR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V1, _In_ FXMVECTOR V2);
XMVECTOR XMVectorLess(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorLessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorInBounds(FXMVECTOR V, FXMVECTOR Bounds);
XMVECTOR XMVectorInBoundsR(_Out_ uint32_t* pCR, _In_ FXMVECTOR V, _In_ FXMVECTOR Bounds);

XMVECTOR XMVectorIsNaN(FXMVECTOR V);
XMVECTOR XMVectorIsInfinite(FXMVECTOR V);

// Per-component rounding / clamping.
XMVECTOR XMVectorMin(FXMVECTOR V1,FXMVECTOR V2);
XMVECTOR XMVectorMax(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorRound(FXMVECTOR V);
XMVECTOR XMVectorTruncate(FXMVECTOR V);
XMVECTOR XMVectorFloor(FXMVECTOR V);
XMVECTOR XMVectorCeiling(FXMVECTOR V);
XMVECTOR XMVectorClamp(FXMVECTOR V, FXMVECTOR Min, FXMVECTOR Max);
XMVECTOR XMVectorSaturate(FXMVECTOR V);

// Bitwise operations on the integer view of the lanes.
XMVECTOR XMVectorAndInt(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorAndCInt(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorOrInt(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorNorInt(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorXorInt(FXMVECTOR V1, FXMVECTOR V2);

// Per-component arithmetic and transcendental functions; 'Est' variants trade
// accuracy for speed.
XMVECTOR XMVectorNegate(FXMVECTOR V);
XMVECTOR XMVectorAdd(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorAddAngles(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorSubtract(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorSubtractAngles(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorMultiply(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorMultiplyAdd(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
XMVECTOR XMVectorDivide(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorNegativeMultiplySubtract(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
XMVECTOR XMVectorScale(FXMVECTOR V, float ScaleFactor);
XMVECTOR XMVectorReciprocalEst(FXMVECTOR V);
XMVECTOR XMVectorReciprocal(FXMVECTOR V);
XMVECTOR XMVectorSqrtEst(FXMVECTOR V);
XMVECTOR XMVectorSqrt(FXMVECTOR V);
XMVECTOR XMVectorReciprocalSqrtEst(FXMVECTOR V);
XMVECTOR XMVectorReciprocalSqrt(FXMVECTOR V);
XMVECTOR XMVectorExp(FXMVECTOR V);
XMVECTOR XMVectorLog(FXMVECTOR V);
XMVECTOR XMVectorPow(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorAbs(FXMVECTOR V);
XMVECTOR XMVectorMod(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVectorModAngles(FXMVECTOR Angles);
XMVECTOR XMVectorSin(FXMVECTOR V);
XMVECTOR XMVectorSinEst(FXMVECTOR V);
XMVECTOR XMVectorCos(FXMVECTOR V);
XMVECTOR XMVectorCosEst(FXMVECTOR V);
void XMVectorSinCos(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V);
void XMVectorSinCosEst(_Out_ XMVECTOR* pSin, _Out_ XMVECTOR* pCos, _In_ FXMVECTOR V);
XMVECTOR XMVectorTan(FXMVECTOR V);
XMVECTOR XMVectorTanEst(FXMVECTOR V);
XMVECTOR XMVectorSinH(FXMVECTOR V);
XMVECTOR XMVectorCosH(FXMVECTOR V);
XMVECTOR XMVectorTanH(FXMVECTOR V);
XMVECTOR XMVectorASin(FXMVECTOR V);
XMVECTOR XMVectorASinEst(FXMVECTOR V);
XMVECTOR XMVectorACos(FXMVECTOR V);
XMVECTOR XMVectorACosEst(FXMVECTOR V);
XMVECTOR XMVectorATan(FXMVECTOR V);
XMVECTOR XMVectorATanEst(FXMVECTOR V);
XMVECTOR XMVectorATan2(FXMVECTOR Y, FXMVECTOR X);
XMVECTOR XMVectorATan2Est(FXMVECTOR Y, FXMVECTOR X);
// Interpolation: lerp, Hermite spline, Catmull-Rom spline, barycentric blend.
XMVECTOR XMVectorLerp(FXMVECTOR V0, FXMVECTOR V1, float t);
XMVECTOR XMVectorLerpV(FXMVECTOR V0, FXMVECTOR V1, FXMVECTOR T);
XMVECTOR XMVectorHermite(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, float t);
XMVECTOR XMVectorHermiteV(FXMVECTOR Position0, FXMVECTOR Tangent0, FXMVECTOR Position1, GXMVECTOR Tangent1, CXMVECTOR T);
XMVECTOR XMVectorCatmullRom(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, float t);
XMVECTOR XMVectorCatmullRomV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR Position3, CXMVECTOR T);
XMVECTOR XMVectorBaryCentric(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, float f, float g);
XMVECTOR XMVectorBaryCentricV(FXMVECTOR Position0, FXMVECTOR Position1, FXMVECTOR Position2, GXMVECTOR F, CXMVECTOR G);

/****************************************************************************
 *
 * 2D vector operations
 *
 ****************************************************************************/

bool XMVector2Equal(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector2EqualR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector2EqualInt(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector2EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector2NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
bool XMVector2NotEqual(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector2NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector2Greater(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector2GreaterR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector2GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector2GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
bool
XMVector2Less(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector2LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector2InBounds(FXMVECTOR V, FXMVECTOR Bounds);

bool XMVector2IsNaN(FXMVECTOR V);
bool XMVector2IsInfinite(FXMVECTOR V);

// 2D geometric operations. Declarations only — implementations live in
// DirectXMathVector.inl.
XMVECTOR XMVector2Dot(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVector2Cross(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVector2LengthSq(FXMVECTOR V);
XMVECTOR XMVector2ReciprocalLengthEst(FXMVECTOR V);
XMVECTOR XMVector2ReciprocalLength(FXMVECTOR V);
XMVECTOR XMVector2LengthEst(FXMVECTOR V);
XMVECTOR XMVector2Length(FXMVECTOR V);
XMVECTOR XMVector2NormalizeEst(FXMVECTOR V);
XMVECTOR XMVector2Normalize(FXMVECTOR V);
XMVECTOR XMVector2ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
XMVECTOR XMVector2ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
XMVECTOR XMVector2Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
XMVECTOR XMVector2Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
XMVECTOR XMVector2RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
XMVECTOR XMVector2Orthogonal(FXMVECTOR V);
XMVECTOR XMVector2AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
XMVECTOR XMVector2AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
XMVECTOR XMVector2AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVector2LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
XMVECTOR XMVector2IntersectLine(FXMVECTOR Line1Point1, FXMVECTOR Line1Point2, FXMVECTOR Line2Point1, GXMVECTOR Line2Point2);
// Stream transforms walk VectorCount elements with caller-supplied byte strides
// and return the output pointer.
XMVECTOR XMVector2Transform(FXMVECTOR V, CXMMATRIX M);
XMFLOAT4* XMVector2TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
                                   _In_ size_t OutputStride,
                                   _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
                                   _In_ size_t InputStride, _In_ size_t VectorCount, _In_ CXMMATRIX M);
XMVECTOR XMVector2TransformCoord(FXMVECTOR V, CXMMATRIX M);
XMFLOAT2* XMVector2TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
                                        _In_ size_t OutputStride,
                                        _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
                                        _In_ size_t InputStride, _In_ size_t VectorCount, _In_ CXMMATRIX M);
XMVECTOR XMVector2TransformNormal(FXMVECTOR V, CXMMATRIX M);
XMFLOAT2* XMVector2TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT2)+OutputStride*(VectorCount-1)) XMFLOAT2* pOutputStream,
                                         _In_ size_t OutputStride,
                                         _In_reads_bytes_(sizeof(XMFLOAT2)+InputStride*(VectorCount-1)) const XMFLOAT2* pInputStream,
                                         _In_ size_t InputStride, _In_ size_t VectorCount, _In_ CXMMATRIX M);

/****************************************************************************
 *
 * 3D vector operations
 *
 ****************************************************************************/

bool XMVector3Equal(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector3EqualR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3EqualInt(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector3EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
bool XMVector3NotEqual(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3Greater(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector3GreaterR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector3GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3Less(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector3InBounds(FXMVECTOR V, FXMVECTOR Bounds);

bool XMVector3IsNaN(FXMVECTOR V);
bool XMVector3IsInfinite(FXMVECTOR V);

XMVECTOR XMVector3Dot(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVector3Cross(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVector3LengthSq(FXMVECTOR V);
XMVECTOR XMVector3ReciprocalLengthEst(FXMVECTOR V);
XMVECTOR
XMVector3ReciprocalLength(FXMVECTOR V);
XMVECTOR XMVector3LengthEst(FXMVECTOR V);
XMVECTOR XMVector3Length(FXMVECTOR V);
XMVECTOR XMVector3NormalizeEst(FXMVECTOR V);
XMVECTOR XMVector3Normalize(FXMVECTOR V);
XMVECTOR XMVector3ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
XMVECTOR XMVector3ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
XMVECTOR XMVector3Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
XMVECTOR XMVector3Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
XMVECTOR XMVector3RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
XMVECTOR XMVector3Orthogonal(FXMVECTOR V);
XMVECTOR XMVector3AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
XMVECTOR XMVector3AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR N2);
XMVECTOR XMVector3AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVector3LinePointDistance(FXMVECTOR LinePoint1, FXMVECTOR LinePoint2, FXMVECTOR Point);
// Splits V into components parallel and perpendicular to Normal.
void XMVector3ComponentsFromNormal(_Out_ XMVECTOR* pParallel, _Out_ XMVECTOR* pPerpendicular, _In_ FXMVECTOR V, _In_ FXMVECTOR Normal);
XMVECTOR XMVector3Rotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
XMVECTOR XMVector3InverseRotate(FXMVECTOR V, FXMVECTOR RotationQuaternion);
// Stream transforms walk VectorCount elements with caller-supplied byte strides
// and return the output pointer.
XMVECTOR XMVector3Transform(FXMVECTOR V, CXMMATRIX M);
XMFLOAT4* XMVector3TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
                                   _In_ size_t OutputStride,
                                   _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
                                   _In_ size_t InputStride, _In_ size_t VectorCount, _In_ CXMMATRIX M);
XMVECTOR XMVector3TransformCoord(FXMVECTOR V, CXMMATRIX M);
XMFLOAT3* XMVector3TransformCoordStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
                                        _In_ size_t OutputStride,
                                        _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
                                        _In_ size_t InputStride, _In_ size_t VectorCount, _In_ CXMMATRIX M);
XMVECTOR XMVector3TransformNormal(FXMVECTOR V, CXMMATRIX M);
XMFLOAT3* XMVector3TransformNormalStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
                                         _In_ size_t OutputStride,
                                         _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
                                         _In_ size_t InputStride, _In_ size_t VectorCount, _In_ CXMMATRIX M);
// Project/Unproject map between world space and viewport (screen) space using
// the given viewport rectangle, depth range, and World/View/Projection matrices.
XMVECTOR XMVector3Project(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
                          CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
XMFLOAT3* XMVector3ProjectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
                                 _In_ size_t OutputStride,
                                 _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
                                 _In_ size_t InputStride, _In_ size_t VectorCount,
                                 _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ,
                                 _In_ CXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World);
XMVECTOR XMVector3Unproject(FXMVECTOR V, float ViewportX, float ViewportY, float ViewportWidth, float ViewportHeight, float ViewportMinZ, float ViewportMaxZ,
                            CXMMATRIX Projection, CXMMATRIX View, CXMMATRIX World);
XMFLOAT3* XMVector3UnprojectStream(_Out_writes_bytes_(sizeof(XMFLOAT3)+OutputStride*(VectorCount-1)) XMFLOAT3* pOutputStream,
                                   _In_ size_t OutputStride,
                                   _In_reads_bytes_(sizeof(XMFLOAT3)+InputStride*(VectorCount-1)) const XMFLOAT3* pInputStream,
                                   _In_ size_t InputStride, _In_ size_t VectorCount,
                                   _In_ float ViewportX, _In_ float ViewportY, _In_ float ViewportWidth, _In_ float ViewportHeight, _In_ float ViewportMinZ, _In_ float ViewportMaxZ,
                                   _In_ CXMMATRIX Projection, _In_ CXMMATRIX View, _In_ CXMMATRIX World);

/****************************************************************************
 *
 * 4D vector operations
 *
 ****************************************************************************/

bool XMVector4Equal(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector4EqualR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4EqualInt(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector4EqualIntR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4NearEqual(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR Epsilon);
bool XMVector4NotEqual(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4NotEqualInt(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4Greater(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector4GreaterR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4GreaterOrEqual(FXMVECTOR V1, FXMVECTOR V2);
uint32_t XMVector4GreaterOrEqualR(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4Less(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4LessOrEqual(FXMVECTOR V1, FXMVECTOR V2);
bool XMVector4InBounds(FXMVECTOR V, FXMVECTOR Bounds);

bool XMVector4IsNaN(FXMVECTOR V);
bool XMVector4IsInfinite(FXMVECTOR V);

XMVECTOR XMVector4Dot(FXMVECTOR V1, FXMVECTOR V2);
// 4D cross product takes three operands.
XMVECTOR XMVector4Cross(FXMVECTOR V1, FXMVECTOR V2, FXMVECTOR V3);
XMVECTOR XMVector4LengthSq(FXMVECTOR V);
XMVECTOR XMVector4ReciprocalLengthEst(FXMVECTOR V);
XMVECTOR XMVector4ReciprocalLength(FXMVECTOR V);
XMVECTOR XMVector4LengthEst(FXMVECTOR V);
XMVECTOR XMVector4Length(FXMVECTOR V);
XMVECTOR XMVector4NormalizeEst(FXMVECTOR V);
XMVECTOR XMVector4Normalize(FXMVECTOR V);
XMVECTOR XMVector4ClampLength(FXMVECTOR V, float LengthMin, float LengthMax);
XMVECTOR XMVector4ClampLengthV(FXMVECTOR V, FXMVECTOR LengthMin, FXMVECTOR LengthMax);
XMVECTOR XMVector4Reflect(FXMVECTOR Incident, FXMVECTOR Normal);
XMVECTOR XMVector4Refract(FXMVECTOR Incident, FXMVECTOR Normal, float RefractionIndex);
XMVECTOR XMVector4RefractV(FXMVECTOR Incident, FXMVECTOR Normal, FXMVECTOR RefractionIndex);
XMVECTOR XMVector4Orthogonal(FXMVECTOR V);
XMVECTOR XMVector4AngleBetweenNormalsEst(FXMVECTOR N1, FXMVECTOR N2);
XMVECTOR XMVector4AngleBetweenNormals(FXMVECTOR N1, FXMVECTOR
N2);
XMVECTOR XMVector4AngleBetweenVectors(FXMVECTOR V1, FXMVECTOR V2);
XMVECTOR XMVector4Transform(FXMVECTOR V, CXMMATRIX M);
XMFLOAT4* XMVector4TransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(VectorCount-1)) XMFLOAT4* pOutputStream,
                                   _In_ size_t OutputStride,
                                   _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(VectorCount-1)) const XMFLOAT4* pInputStream,
                                   _In_ size_t InputStride, _In_ size_t VectorCount, _In_ CXMMATRIX M);

/****************************************************************************
 *
 * Matrix operations
 *
 ****************************************************************************/

// Declarations only — implementations live in DirectXMathMatrix.inl.
bool XMMatrixIsNaN(CXMMATRIX M);
bool XMMatrixIsInfinite(CXMMATRIX M);
bool XMMatrixIsIdentity(CXMMATRIX M);

XMMATRIX XMMatrixMultiply(CXMMATRIX M1, CXMMATRIX M2);
XMMATRIX XMMatrixMultiplyTranspose(CXMMATRIX M1, CXMMATRIX M2);
XMMATRIX XMMatrixTranspose(CXMMATRIX M);
// pDeterminant may be null when the caller does not need the determinant.
XMMATRIX XMMatrixInverse(_Out_opt_ XMVECTOR* pDeterminant, _In_ CXMMATRIX M);
XMVECTOR XMMatrixDeterminant(CXMMATRIX M);
// Output parameters are valid only when this returns true (see _Success_).
_Success_(return)
bool XMMatrixDecompose(_Out_ XMVECTOR *outScale, _Out_ XMVECTOR *outRotQuat, _Out_ XMVECTOR *outTrans, _In_ CXMMATRIX M);

// Matrix constructors.
XMMATRIX XMMatrixIdentity();
XMMATRIX XMMatrixSet(float m00, float m01, float m02, float m03,
                     float m10, float m11, float m12, float m13,
                     float m20, float m21, float m22, float m23,
                     float m30, float m31, float m32, float m33);
XMMATRIX XMMatrixTranslation(float OffsetX, float OffsetY, float OffsetZ);
XMMATRIX XMMatrixTranslationFromVector(FXMVECTOR Offset);
XMMATRIX XMMatrixScaling(float ScaleX, float ScaleY, float ScaleZ);
XMMATRIX XMMatrixScalingFromVector(FXMVECTOR Scale);
XMMATRIX XMMatrixRotationX(float Angle);
XMMATRIX XMMatrixRotationY(float Angle);
XMMATRIX XMMatrixRotationZ(float Angle);
XMMATRIX XMMatrixRotationRollPitchYaw(float Pitch, float Yaw, float Roll);
XMMATRIX XMMatrixRotationRollPitchYawFromVector(FXMVECTOR Angles);
XMMATRIX XMMatrixRotationNormal(FXMVECTOR NormalAxis, float Angle);
XMMATRIX XMMatrixRotationAxis(FXMVECTOR Axis, float Angle);
XMMATRIX XMMatrixRotationQuaternion(FXMVECTOR Quaternion);
XMMATRIX XMMatrixTransformation2D(FXMVECTOR ScalingOrigin, float ScalingOrientation, FXMVECTOR Scaling,
                                  FXMVECTOR RotationOrigin, float Rotation, GXMVECTOR Translation);
XMMATRIX XMMatrixTransformation(FXMVECTOR ScalingOrigin, FXMVECTOR ScalingOrientationQuaternion, FXMVECTOR Scaling,
                                GXMVECTOR RotationOrigin, CXMVECTOR RotationQuaternion, CXMVECTOR Translation);
XMMATRIX XMMatrixAffineTransformation2D(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, float Rotation, FXMVECTOR Translation);
XMMATRIX XMMatrixAffineTransformation(FXMVECTOR Scaling, FXMVECTOR RotationOrigin, FXMVECTOR RotationQuaternion, GXMVECTOR Translation);
XMMATRIX XMMatrixReflect(FXMVECTOR ReflectionPlane);
XMMATRIX XMMatrixShadow(FXMVECTOR ShadowPlane, FXMVECTOR LightPosition);

// View and projection constructors; LH/RH = left-/right-handed coordinates.
XMMATRIX XMMatrixLookAtLH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
XMMATRIX XMMatrixLookAtRH(FXMVECTOR EyePosition, FXMVECTOR FocusPosition, FXMVECTOR UpDirection);
XMMATRIX XMMatrixLookToLH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
XMMATRIX XMMatrixLookToRH(FXMVECTOR EyePosition, FXMVECTOR EyeDirection, FXMVECTOR UpDirection);
XMMATRIX XMMatrixPerspectiveLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
XMMATRIX XMMatrixPerspectiveRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
XMMATRIX XMMatrixPerspectiveFovLH(float FovAngleY, float AspectHByW, float NearZ, float FarZ);
XMMATRIX XMMatrixPerspectiveFovRH(float FovAngleY, float AspectHByW, float NearZ, float FarZ);
XMMATRIX XMMatrixPerspectiveOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
XMMATRIX XMMatrixPerspectiveOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
XMMATRIX XMMatrixOrthographicLH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
XMMATRIX XMMatrixOrthographicRH(float ViewWidth, float ViewHeight, float NearZ, float FarZ);
XMMATRIX XMMatrixOrthographicOffCenterLH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);
XMMATRIX XMMatrixOrthographicOffCenterRH(float ViewLeft, float ViewRight, float ViewBottom, float ViewTop, float NearZ, float FarZ);


/****************************************************************************
 *
 * Quaternion operations
 *
 ****************************************************************************/

bool XMQuaternionEqual(FXMVECTOR Q1, FXMVECTOR Q2);
bool XMQuaternionNotEqual(FXMVECTOR Q1, FXMVECTOR Q2);

bool XMQuaternionIsNaN(FXMVECTOR Q);
bool XMQuaternionIsInfinite(FXMVECTOR Q);
bool XMQuaternionIsIdentity(FXMVECTOR Q);

XMVECTOR XMQuaternionDot(FXMVECTOR Q1, FXMVECTOR Q2);
XMVECTOR XMQuaternionMultiply(FXMVECTOR Q1, FXMVECTOR Q2);
XMVECTOR XMQuaternionLengthSq(FXMVECTOR Q);
XMVECTOR XMQuaternionReciprocalLength(FXMVECTOR Q);
XMVECTOR XMQuaternionLength(FXMVECTOR Q);
XMVECTOR XMQuaternionNormalizeEst(FXMVECTOR Q);
XMVECTOR XMQuaternionNormalize(FXMVECTOR Q);
XMVECTOR XMQuaternionConjugate(FXMVECTOR Q);
XMVECTOR XMQuaternionInverse(FXMVECTOR Q);
XMVECTOR XMQuaternionLn(FXMVECTOR Q);
XMVECTOR XMQuaternionExp(FXMVECTOR Q);
// Interpolation: spherical lerp, squad (spherical quadrangle), barycentric.
XMVECTOR XMQuaternionSlerp(FXMVECTOR Q0, FXMVECTOR Q1, float t);
XMVECTOR XMQuaternionSlerpV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR T);
XMVECTOR XMQuaternionSquad(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, float t);
XMVECTOR XMQuaternionSquadV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR Q3, CXMVECTOR T);
void XMQuaternionSquadSetup(_Out_ XMVECTOR* pA, _Out_ XMVECTOR* pB, _Out_ XMVECTOR* pC, _In_ FXMVECTOR Q0, _In_ FXMVECTOR Q1, _In_ FXMVECTOR Q2, _In_ GXMVECTOR Q3);
XMVECTOR XMQuaternionBaryCentric(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, float f, float g);
XMVECTOR
XMQuaternionBaryCentricV(FXMVECTOR Q0, FXMVECTOR Q1, FXMVECTOR Q2, GXMVECTOR F, CXMVECTOR G); + +XMVECTOR XMQuaternionIdentity(); +XMVECTOR XMQuaternionRotationRollPitchYaw(float Pitch, float Yaw, float Roll); +XMVECTOR XMQuaternionRotationRollPitchYawFromVector(FXMVECTOR Angles); +XMVECTOR XMQuaternionRotationNormal(FXMVECTOR NormalAxis, float Angle); +XMVECTOR XMQuaternionRotationAxis(FXMVECTOR Axis, float Angle); +XMVECTOR XMQuaternionRotationMatrix(CXMMATRIX M); + +void XMQuaternionToAxisAngle(_Out_ XMVECTOR* pAxis, _Out_ float* pAngle, _In_ FXMVECTOR Q); + +/**************************************************************************** + * + * Plane operations + * + ****************************************************************************/ + +bool XMPlaneEqual(FXMVECTOR P1, FXMVECTOR P2); +bool XMPlaneNearEqual(FXMVECTOR P1, FXMVECTOR P2, FXMVECTOR Epsilon); +bool XMPlaneNotEqual(FXMVECTOR P1, FXMVECTOR P2); + +bool XMPlaneIsNaN(FXMVECTOR P); +bool XMPlaneIsInfinite(FXMVECTOR P); + +XMVECTOR XMPlaneDot(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XMPlaneDotCoord(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XMPlaneDotNormal(FXMVECTOR P, FXMVECTOR V); +XMVECTOR XMPlaneNormalizeEst(FXMVECTOR P); +XMVECTOR XMPlaneNormalize(FXMVECTOR P); +XMVECTOR XMPlaneIntersectLine(FXMVECTOR P, FXMVECTOR LinePoint1, FXMVECTOR LinePoint2); +void XMPlaneIntersectPlane(_Out_ XMVECTOR* pLinePoint1, _Out_ XMVECTOR* pLinePoint2, _In_ FXMVECTOR P1, _In_ FXMVECTOR P2); +XMVECTOR XMPlaneTransform(FXMVECTOR P, CXMMATRIX M); +XMFLOAT4* XMPlaneTransformStream(_Out_writes_bytes_(sizeof(XMFLOAT4)+OutputStride*(PlaneCount-1)) XMFLOAT4* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(XMFLOAT4)+InputStride*(PlaneCount-1)) const XMFLOAT4* pInputStream, + _In_ size_t InputStride, _In_ size_t PlaneCount, _In_ CXMMATRIX M); + +XMVECTOR XMPlaneFromPointNormal(FXMVECTOR Point, FXMVECTOR Normal); +XMVECTOR XMPlaneFromPoints(FXMVECTOR Point1, FXMVECTOR Point2, FXMVECTOR Point3); + 
+/**************************************************************************** + * + * Color operations + * + ****************************************************************************/ + +bool XMColorEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XMColorNotEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XMColorGreater(FXMVECTOR C1, FXMVECTOR C2); +bool XMColorGreaterOrEqual(FXMVECTOR C1, FXMVECTOR C2); +bool XMColorLess(FXMVECTOR C1, FXMVECTOR C2); +bool XMColorLessOrEqual(FXMVECTOR C1, FXMVECTOR C2); + +bool XMColorIsNaN(FXMVECTOR C); +bool XMColorIsInfinite(FXMVECTOR C); + +XMVECTOR XMColorNegative(FXMVECTOR C); +XMVECTOR XMColorModulate(FXMVECTOR C1, FXMVECTOR C2); +XMVECTOR XMColorAdjustSaturation(FXMVECTOR C, float Saturation); +XMVECTOR XMColorAdjustContrast(FXMVECTOR C, float Contrast); + +XMVECTOR XMColorRGBToHSL( FXMVECTOR rgb ); +XMVECTOR XMColorHSLToRGB( FXMVECTOR hsl ); + +XMVECTOR XMColorRGBToHSV( FXMVECTOR rgb ); +XMVECTOR XMColorHSVToRGB( FXMVECTOR hsv ); + +XMVECTOR XMColorRGBToYUV( FXMVECTOR rgb ); +XMVECTOR XMColorYUVToRGB( FXMVECTOR yuv ); + +XMVECTOR XMColorRGBToYUV_HD( FXMVECTOR rgb ); +XMVECTOR XMColorYUVToRGB_HD( FXMVECTOR yuv ); + +XMVECTOR XMColorRGBToXYZ( FXMVECTOR rgb ); +XMVECTOR XMColorXYZToRGB( FXMVECTOR xyz ); + +XMVECTOR XMColorXYZToSRGB( FXMVECTOR xyz ); +XMVECTOR XMColorSRGBToXYZ( FXMVECTOR srgb ); + +/**************************************************************************** + * + * Miscellaneous operations + * + ****************************************************************************/ + +bool XMVerifyCPUSupport(); + +XMVECTOR XMFresnelTerm(FXMVECTOR CosIncidentAngle, FXMVECTOR RefractionIndex); + +bool XMScalarNearEqual(float S1, float S2, float Epsilon); +float XMScalarModAngle(float Value); + +float XMScalarSin(float Value); +float XMScalarSinEst(float Value); + +float XMScalarCos(float Value); +float XMScalarCosEst(float Value); + +void XMScalarSinCos(_Out_ float* pSin, _Out_ float* pCos, float Value); +void 
XMScalarSinCosEst(_Out_ float* pSin, _Out_ float* pCos, float Value); + +float XMScalarASin(float Value); +float XMScalarASinEst(float Value); + +float XMScalarACos(float Value); +float XMScalarACosEst(float Value); + +/**************************************************************************** + * + * Templates + * + ****************************************************************************/ + +#if defined(__XNAMATH_H__) && defined(XMMin) +#undef XMMin +#undef XMMax +#endif + +template<class T> inline T XMMin(T a, T b) { return (a < b) ? a : b; } +template<class T> inline T XMMax(T a, T b) { return (a > b) ? a : b; } + +//------------------------------------------------------------------------------ + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +#define XM_PERMUTE_PS( v, c ) _mm_shuffle_ps( v, v, c ) + +// PermuteHelper internal template (SSE only) +namespace Internal +{ + // Slow path fallback for permutes that do not map to a single SSE shuffle opcode. + template<uint32_t Shuffle, bool WhichX, bool WhichY, bool WhichZ, bool WhichW> struct PermuteHelper + { + static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) + { + static const XMVECTORU32 selectMask = + { + WhichX ? 0xFFFFFFFF : 0, + WhichY ? 0xFFFFFFFF : 0, + WhichZ ? 0xFFFFFFFF : 0, + WhichW ? 0xFFFFFFFF : 0, + }; + + XMVECTOR shuffled1 = XM_PERMUTE_PS(v1, Shuffle); + XMVECTOR shuffled2 = XM_PERMUTE_PS(v2, Shuffle); + + XMVECTOR masked1 = _mm_andnot_ps(selectMask, shuffled1); + XMVECTOR masked2 = _mm_and_ps(selectMask, shuffled2); + + return _mm_or_ps(masked1, masked2); + } + }; + + // Fast path for permutes that only read from the first vector. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, false, false> + { + static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { (v2); return XM_PERMUTE_PS(v1, Shuffle); } + }; + + // Fast path for permutes that only read from the second vector. 
+ template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, true, true> + { + static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2){ (v1); return XM_PERMUTE_PS(v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the first vector, ZW from the second. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, false, false, true, true> + { + static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v1, v2, Shuffle); } + }; + + // Fast path for permutes that read XY from the second vector, ZW from the first. + template<uint32_t Shuffle> struct PermuteHelper<Shuffle, true, true, false, false> + { + static XMVECTOR Permute(FXMVECTOR v1, FXMVECTOR v2) { return _mm_shuffle_ps(v2, v1, Shuffle); } + }; +}; + +#endif // _XM_SSE_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +// General permute template +template<uint32_t PermuteX, uint32_t PermuteY, uint32_t PermuteZ, uint32_t PermuteW> + inline XMVECTOR XMVectorPermute(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert(PermuteX <= 7, "PermuteX template parameter out of range"); + static_assert(PermuteY <= 7, "PermuteY template parameter out of range"); + static_assert(PermuteZ <= 7, "PermuteZ template parameter out of range"); + static_assert(PermuteW <= 7, "PermuteW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + const uint32_t Shuffle = _MM_SHUFFLE(PermuteW & 3, PermuteZ & 3, PermuteY & 3, PermuteX & 3); + + const bool WhichX = PermuteX > 3; + const bool WhichY = PermuteY > 3; + const bool WhichZ = PermuteZ > 3; + const bool WhichW = PermuteW > 3; + + return Internal::PermuteHelper<Shuffle, WhichX, WhichY, WhichZ, WhichW>::Permute(V1, V2); +#else + + return XMVectorPermute( V1, V2, PermuteX, PermuteY, PermuteZ, PermuteW ); + +#endif +} + +// Special-case permute templates +template<> inline XMVECTOR XMVectorPermute<0,1,2,3>(FXMVECTOR V1, FXMVECTOR V2) { (V2); return V1; } +template<> inline XMVECTOR XMVectorPermute<4,5,6,7>(FXMVECTOR V1, 
FXMVECTOR V2) { (V1); return V2; } + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +// If the indices are all in the range 0-3 or 4-7, then use XMVectorSwizzle instead +// The mirror cases are not spelled out here as the programmer can always swap the arguments +// (i.e. prefer permutes where the X element comes from the V1 vector instead of the V2 vector) + +template<> inline XMVECTOR XMVectorPermute<0,1,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_low_f32(V2) ); } +template<> inline XMVECTOR XMVectorPermute<1,0,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XMVectorPermute<0,1,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XMVectorPermute<1,0,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XMVectorPermute<2,3,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XMVectorPermute<3,2,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR XMVectorPermute<2,3,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> inline XMVECTOR XMVectorPermute<3,2,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XMVectorPermute<0,1,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vget_high_f32(V2) ); } +template<> inline XMVECTOR XMVectorPermute<1,0,6,7>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vget_high_f32(V2) ); } +template<> inline XMVECTOR 
XMVectorPermute<0,1,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_low_f32(V1), vrev64_f32( vget_high_f32(V2) ) ); } +template<> inline XMVECTOR XMVectorPermute<1,0,7,6>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_low_f32(V1) ), vrev64_f32( vget_high_f32(V2) ) ); } + +template<> inline XMVECTOR XMVectorPermute<3,2,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vget_low_f32(V2) ); } +template<> inline XMVECTOR XMVectorPermute<2,3,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vget_high_f32(V1), vrev64_f32( vget_low_f32(V2) ) ); } +template<> inline XMVECTOR XMVectorPermute<3,2,5,4>(FXMVECTOR V1, FXMVECTOR V2) { return vcombine_f32( vrev64_f32( vget_high_f32(V1) ), vrev64_f32( vget_low_f32(V2) ) ); } + +template<> inline XMVECTOR XMVectorPermute<0,4,2,6>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XMVectorPermute<1,5,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vtrnq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XMVectorPermute<0,4,1,5>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XMVectorPermute<2,6,3,7>(FXMVECTOR V1, FXMVECTOR V2) { return vzipq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XMVectorPermute<0,2,4,6>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[0]; } +template<> inline XMVECTOR XMVectorPermute<1,3,5,7>(FXMVECTOR V1, FXMVECTOR V2) { return vuzpq_f32(V1,V2).val[1]; } + +template<> inline XMVECTOR XMVectorPermute<1,2,3,4>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 1); } +template<> inline XMVECTOR XMVectorPermute<2,3,4,5>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 2); } +template<> inline XMVECTOR XMVectorPermute<3,4,5,6>(FXMVECTOR V1, FXMVECTOR V2) { return vextq_f32(V1, V2, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ + +// 
General swizzle template +template<uint32_t SwizzleX, uint32_t SwizzleY, uint32_t SwizzleZ, uint32_t SwizzleW> + inline XMVECTOR XMVectorSwizzle(FXMVECTOR V) +{ + static_assert(SwizzleX <= 3, "SwizzleX template parameter out of range"); + static_assert(SwizzleY <= 3, "SwizzleY template parameter out of range"); + static_assert(SwizzleZ <= 3, "SwizzleZ template parameter out of range"); + static_assert(SwizzleW <= 3, "SwizzleW template parameter out of range"); + +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE( SwizzleW, SwizzleZ, SwizzleY, SwizzleX ) ); +#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + return __vpermwi(V, ((SwizzleX & 3) << 6) | ((SwizzleY & 3) << 4) | ((SwizzleZ & 3) << 2) | (SwizzleW & 3) ); +#else + + return XMVectorSwizzle( V, SwizzleX, SwizzleY, SwizzleZ, SwizzleW ); + +#endif +} + +// Specialized swizzles +template<> inline XMVECTOR XMVectorSwizzle<0,1,2,3>(FXMVECTOR V) { return V; } + + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + +template<> inline XMVECTOR XMVectorSwizzle<0,0,0,0>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 0); } +template<> inline XMVECTOR XMVectorSwizzle<1,1,1,1>(FXMVECTOR V) { return vdupq_lane_f32( vget_low_f32(V), 1); } +template<> inline XMVECTOR XMVectorSwizzle<2,2,2,2>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 0); } +template<> inline XMVECTOR XMVectorSwizzle<3,3,3,3>(FXMVECTOR V) { return vdupq_lane_f32( vget_high_f32(V), 1); } + +template<> inline XMVECTOR XMVectorSwizzle<1,0,3,2>(FXMVECTOR V) { return vrev64q_f32(V); } + +template<> inline XMVECTOR XMVectorSwizzle<0,1,0,1>(FXMVECTOR V) { __n64 vt = vget_low_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XMVectorSwizzle<2,3,2,3>(FXMVECTOR V) { __n64 vt = vget_high_f32(V); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XMVectorSwizzle<1,0,1,0>(FXMVECTOR V) { __n64 vt = vrev64_f32( vget_low_f32(V) 
); return vcombine_f32( vt, vt ); } +template<> inline XMVECTOR XMVectorSwizzle<3,2,3,2>(FXMVECTOR V) { __n64 vt = vrev64_f32( vget_high_f32(V) ); return vcombine_f32( vt, vt ); } + +template<> inline XMVECTOR XMVectorSwizzle<0,1,3,2>(FXMVECTOR V) { return vcombine_f32( vget_low_f32(V), vrev64_f32( vget_high_f32(V) ) ); } +template<> inline XMVECTOR XMVectorSwizzle<1,0,2,3>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_low_f32(V) ), vget_high_f32(V) ); } +template<> inline XMVECTOR XMVectorSwizzle<2,3,1,0>(FXMVECTOR V) { return vcombine_f32( vget_high_f32(V), vrev64_f32( vget_low_f32(V) ) ); } +template<> inline XMVECTOR XMVectorSwizzle<3,2,0,1>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vget_low_f32(V) ); } +template<> inline XMVECTOR XMVectorSwizzle<3,2,1,0>(FXMVECTOR V) { return vcombine_f32( vrev64_f32( vget_high_f32(V) ), vrev64_f32( vget_low_f32(V) ) ); } + +template<> inline XMVECTOR XMVectorSwizzle<0,0,2,2>(FXMVECTOR V) { return vtrnq_f32(V,V).val[0]; } +template<> inline XMVECTOR XMVectorSwizzle<1,1,3,3>(FXMVECTOR V) { return vtrnq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XMVectorSwizzle<0,0,1,1>(FXMVECTOR V) { return vzipq_f32(V,V).val[0]; } +template<> inline XMVECTOR XMVectorSwizzle<2,2,3,3>(FXMVECTOR V) { return vzipq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XMVectorSwizzle<0,2,0,2>(FXMVECTOR V) { return vuzpq_f32(V,V).val[0]; } +template<> inline XMVECTOR XMVectorSwizzle<1,3,1,3>(FXMVECTOR V) { return vuzpq_f32(V,V).val[1]; } + +template<> inline XMVECTOR XMVectorSwizzle<1,2,3,0>(FXMVECTOR V) { return vextq_f32(V, V, 1); } +template<> inline XMVECTOR XMVectorSwizzle<2,3,0,1>(FXMVECTOR V) { return vextq_f32(V, V, 2); } +template<> inline XMVECTOR XMVectorSwizzle<3,0,1,2>(FXMVECTOR V) { return vextq_f32(V, V, 3); } + +#endif // _XM_ARM_NEON_INTRINSICS_ && !_XM_NO_INTRINSICS_ + +//------------------------------------------------------------------------------ + +template<uint32_t Elements> + inline XMVECTOR 
XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#else + return XMVectorPermute<Elements, (Elements + 1), (Elements + 2), (Elements + 3)>(V1, V2); +#endif +} + +template<uint32_t Elements> + inline XMVECTOR XMVectorRotateLeft(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#else + return XMVectorSwizzle<Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3>(V); +#endif +} + +template<uint32_t Elements> + inline XMVECTOR XMVectorRotateRight(FXMVECTOR V) +{ + static_assert( Elements < 4, "Elements template parameter out of range" ); +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#else + return XMVectorSwizzle<(4 - Elements) & 3, (5 - Elements) & 3, (6 - Elements) & 3, (7 - Elements) & 3>(V); +#endif +} + +template<uint32_t VSLeftRotateElements, uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3> + inline XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS) +{ +#if defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#else + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft<VSLeftRotateElements>(VS), Control ); +#endif +} + +/**************************************************************************** + * + * Globals + * + ****************************************************************************/ + +// The purpose of the following global constants is to prevent redundant +// reloading of the constants when they are referenced by more than one +// separate inline math routine called within the same function. 
Declaring +// a constant locally within a routine is sufficient to prevent redundant +// reloads of that constant when that single routine is called multiple +// times in a function, but if the constant is used (and declared) in a +// separate math routine it would be reloaded. + +#ifndef XMGLOBALCONST +#define XMGLOBALCONST static const // extern const // MGH - __declspec(selectany) +#endif + +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients0 = {-0.16666667f, +0.0083333310f, -0.00019840874f, +2.7525562e-06f}; +XMGLOBALCONST XMVECTORF32 g_XMSinCoefficients1 = {-2.3889859e-08f, -0.16665852f /*Est1*/, +0.0083139502f /*Est2*/, -0.00018524670f /*Est3*/}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients0 = {-0.5f, +0.041666638f, -0.0013888378f, +2.4760495e-05f}; +XMGLOBALCONST XMVECTORF32 g_XMCosCoefficients1 = {-2.6051615e-07f, -0.49992746f /*Est1*/, +0.041493919f /*Est2*/, -0.0012712436f /*Est3*/}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients0 = {1.0f, 0.333333333f, 0.133333333f, 5.396825397e-2f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients1 = {2.186948854e-2f, 8.863235530e-3f, 3.592128167e-3f, 1.455834485e-3f}; +XMGLOBALCONST XMVECTORF32 g_XMTanCoefficients2 = {5.900274264e-4f, 2.391290764e-4f, 9.691537707e-5f, 3.927832950e-5f}; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients0 = {+1.5707963050f, -0.2145988016f, +0.0889789874f, -0.0501743046f}; +XMGLOBALCONST XMVECTORF32 g_XMArcCoefficients1 = {+0.0308918810f, -0.0170881256f, +0.0066700901f, -0.0012624911f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients0 = {-0.3333314528f, +0.1999355085f, -0.1420889944f, +0.1065626393f}; +XMGLOBALCONST XMVECTORF32 g_XMATanCoefficients1 = {-0.0752896400f, +0.0429096138f, -0.0161657367f, +0.0028662257f}; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients0 = {+0.999866f, +0.999866f, +0.999866f, +0.999866f}; +XMGLOBALCONST XMVECTORF32 g_XMATanEstCoefficients1 = {-0.3302995f, +0.180141f, -0.085133f, +0.0208351f}; +XMGLOBALCONST XMVECTORF32 g_XMTanEstCoefficients = {2.484f, 
-1.954923183e-1f, 2.467401101f, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMArcEstCoefficients = {+1.5707288f,-0.2121144f,+0.0742610f,-0.0187293f}; +XMGLOBALCONST XMVECTORF32 g_XMPiConstants0 = {XM_PI, XM_2PI, XM_1DIVPI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR0 = {1.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR1 = {0.0f, 1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR2 = {0.0f, 0.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMIdentityR3 = {0.0f, 0.0f, 0.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR0 = {-1.0f,0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR1 = {0.0f,-1.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR2 = {0.0f, 0.0f,-1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegIdentityR3 = {0.0f, 0.0f, 0.0f,-1.0f}; +XMGLOBALCONST XMVECTORI32 g_XMNegativeZero = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMNegate3 = {0x80000000, 0x80000000, 0x80000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskX = {0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskY = {0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskZ = {0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskW = {0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORF32 g_XMOne = { 1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOne3 = { 1.0f, 1.0f, 1.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMZero = { 0.0f, 0.0f, 0.0f, 0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMTwo = { 2.f, 2.f, 2.f, 2.f }; +XMGLOBALCONST XMVECTORF32 g_XMFour = { 4.f, 4.f, 4.f, 4.f }; +XMGLOBALCONST XMVECTORF32 g_XMSix = { 6.f, 6.f, 6.f, 6.f }; +XMGLOBALCONST XMVECTORF32 g_XMNegativeOne = {-1.0f,-1.0f,-1.0f,-1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMOneHalf = { 0.5f, 0.5f, 0.5f, 0.5f}; +XMGLOBALCONST XMVECTORF32 
g_XMNegativeOneHalf = {-0.5f,-0.5f,-0.5f,-0.5f}; +XMGLOBALCONST XMVECTORF32 g_XMNegativeTwoPi = {-XM_2PI, -XM_2PI, -XM_2PI, -XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMNegativePi = {-XM_PI, -XM_PI, -XM_PI, -XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMHalfPi = {XM_PIDIV2, XM_PIDIV2, XM_PIDIV2, XM_PIDIV2}; +XMGLOBALCONST XMVECTORF32 g_XMPi = {XM_PI, XM_PI, XM_PI, XM_PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalPi = {XM_1DIVPI, XM_1DIVPI, XM_1DIVPI, XM_1DIVPI}; +XMGLOBALCONST XMVECTORF32 g_XMTwoPi = {XM_2PI, XM_2PI, XM_2PI, XM_2PI}; +XMGLOBALCONST XMVECTORF32 g_XMReciprocalTwoPi = {XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI, XM_1DIV2PI}; +XMGLOBALCONST XMVECTORF32 g_XMEpsilon = {1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f, 1.192092896e-7f}; +XMGLOBALCONST XMVECTORI32 g_XMInfinity = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaN = {0x7FC00000, 0x7FC00000, 0x7FC00000, 0x7FC00000}; +XMGLOBALCONST XMVECTORI32 g_XMQNaNTest = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMAbsMask = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMFltMin = {0x00800000, 0x00800000, 0x00800000, 0x00800000}; +XMGLOBALCONST XMVECTORI32 g_XMFltMax = {0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF, 0x7F7FFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMNegOneMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}; +XMGLOBALCONST XMVECTORI32 g_XMMaskA8R8G8B8 = {0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipA8R8G8B8 = {0x00000000, 0x00000000, 0x00000000, 0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA8R8G8B8 = {0.0f,0.0f,0.0f,(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA8R8G8B8 = {1.0f/(255.0f*(float)(0x10000)),1.0f/(255.0f*(float)(0x100)),1.0f/255.0f,1.0f/(255.0f*(float)(0x1000000))}; +XMGLOBALCONST XMVECTORI32 g_XMMaskA2B10G10R10 = {0x000003FF, 0x000FFC00, 0x3FF00000, 0xC0000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipA2B10G10R10 = {0x00000200, 0x00080000, 0x20000000, 
0x80000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixAA2B10G10R10 = {-512.0f,-512.0f*(float)(0x400),-512.0f*(float)(0x100000),(float)(0x80000000U)}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeA2B10G10R10 = {1.0f/511.0f,1.0f/(511.0f*(float)(0x400)),1.0f/(511.0f*(float)(0x100000)),1.0f/(3.0f*(float)(0x40000000))}; +XMGLOBALCONST XMVECTORI32 g_XMMaskX16Y16 = {0x0000FFFF, 0xFFFF0000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16 = {0x00008000, 0x00000000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16 = {-32768.0f,0.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16 = {1.0f/32767.0f,1.0f/(32767.0f*65536.0f),0.0f,0.0f}; +XMGLOBALCONST XMVECTORI32 g_XMMaskX16Y16Z16W16 = {0x0000FFFF, 0x0000FFFF, 0xFFFF0000, 0xFFFF0000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipX16Y16Z16W16 = {0x00008000, 0x00008000, 0x00000000, 0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMFixX16Y16Z16W16 = {-32768.0f,-32768.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNormalizeX16Y16Z16W16 = {1.0f/32767.0f,1.0f/32767.0f,1.0f/(32767.0f*65536.0f),1.0f/(32767.0f*65536.0f)}; +XMGLOBALCONST XMVECTORF32 g_XMNoFraction = {8388608.0f,8388608.0f,8388608.0f,8388608.0f}; +XMGLOBALCONST XMVECTORI32 g_XMMaskByte = {0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF}; +XMGLOBALCONST XMVECTORF32 g_XMNegateX = {-1.0f, 1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateY = { 1.0f,-1.0f, 1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateZ = { 1.0f, 1.0f,-1.0f, 1.0f}; +XMGLOBALCONST XMVECTORF32 g_XMNegateW = { 1.0f, 1.0f, 1.0f,-1.0f}; +XMGLOBALCONST XMVECTORI32 g_XMSelect0101 = {XM_SELECT_0, XM_SELECT_1, XM_SELECT_0, XM_SELECT_1}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1010 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMOneHalfMinusEpsilon = { 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD, 0x3EFFFFFD}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1000 = {XM_SELECT_1, XM_SELECT_0, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1100 = {XM_SELECT_1, 
XM_SELECT_1, XM_SELECT_0, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1110 = {XM_SELECT_1, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0}; +XMGLOBALCONST XMVECTORI32 g_XMSelect1011 = { XM_SELECT_1, XM_SELECT_0, XM_SELECT_1, XM_SELECT_1 }; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16 = {1.0f,1.0f/65536.0f,0.0f,0.0f}; +XMGLOBALCONST XMVECTORF32 g_XMFixupY16W16 = {1.0f,1.0f,1.0f/65536.0f,1.0f/65536.0f}; +XMGLOBALCONST XMVECTORI32 g_XMFlipY = {0,0x80000000,0,0}; +XMGLOBALCONST XMVECTORI32 g_XMFlipZ = {0,0,0x80000000,0}; +XMGLOBALCONST XMVECTORI32 g_XMFlipW = {0,0,0,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipYZ = {0,0x80000000,0x80000000,0}; +XMGLOBALCONST XMVECTORI32 g_XMFlipZW = {0,0,0x80000000,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMFlipYW = {0,0x80000000,0,0x80000000}; +XMGLOBALCONST XMVECTORI32 g_XMMaskDec4 = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; +XMGLOBALCONST XMVECTORI32 g_XMXorDec4 = {0x200,0x200<<10,0x200<<20,0}; +XMGLOBALCONST XMVECTORF32 g_XMAddUDec4 = {0,0,0,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMAddDec4 = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMMulDec4 = {1.0f,1.0f/1024.0f,1.0f/(1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; +XMGLOBALCONST XMVECTORI32 g_XMMaskByte4 = {0xFF,0xFF00,0xFF0000,0xFF000000}; +XMGLOBALCONST XMVECTORI32 g_XMXorByte4 = {0x80,0x8000,0x800000,0x00000000}; +XMGLOBALCONST XMVECTORF32 g_XMAddByte4 = {-128.0f,-128.0f*256.0f,-128.0f*65536.0f,0}; +XMGLOBALCONST XMVECTORF32 g_XMFixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f}; +XMGLOBALCONST XMVECTORF32 g_XMMaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f}; +XMGLOBALCONST XMVECTORF32 g_XMUnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f}; +XMGLOBALCONST 
XMVECTORF32 g_XMsrgbScale = { 12.92f, 12.92f, 12.92f, 1.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA = { 0.055f, 0.055f, 0.055f, 0.0f }; +XMGLOBALCONST XMVECTORF32 g_XMsrgbA1 = { 1.055f, 1.055f, 1.055f, 1.0f }; + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 4640 6001) + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +//------------------------------------------------------------------------------ + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + +inline XMVECTOR XMVectorSetBinaryConstant(uint32_t C0, uint32_t C1, uint32_t C2, uint32_t C3) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = (0-(C3&1)) & 0x3F800000; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 vResult; + vResult.u[0] = (0-(C0&1)) & 0x3F800000; + vResult.u[1] = (0-(C1&1)) & 0x3F800000; + vResult.u[2] = (0-(C2&1)) & 0x3F800000; + vResult.u[3] = (0-(C3&1)) & 0x3F800000; + return vResult.v; +#else // XM_SSE_INTRINSICS_ + static const XMVECTORU32 g_vMask1 = {1,1,1,1}; + // Move the parms to a vector + __m128i vTemp = _mm_set_epi32(C3,C2,C1,C0); + // Mask off the low bits + vTemp = _mm_and_si128(vTemp,g_vMask1); + // 0xFFFFFFFF on true bits + vTemp = _mm_cmpeq_epi32(vTemp,g_vMask1); + // 0xFFFFFFFF -> 1.0f, 0x00000000 -> 0.0f + vTemp = _mm_and_si128(vTemp,g_XMOne); + return _mm_castsi128_ps(vTemp); +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorSplatConstant(int32_t IntConstant, uint32_t DivExponent) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); 
+ assert( DivExponent < 32 ); +#if defined(_XM_NO_INTRINSICS_) + + using DirectX::XMConvertVectorIntToFloat; + + XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant }; + return XMConvertVectorIntToFloat( V.v, DivExponent); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Splat the int + int32x4_t vScale = vdupq_n_s32(IntConstant); + // Convert to a float + XMVECTOR vResult = vcvtq_f32_s32(vScale); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat the scalar value (It's really a float) + vScale = vdupq_n_s32(uScale); + // Multiply by the reciprocal (Perform a right shift by DivExponent) + vResult = vmulq_f32(vResult,reinterpret_cast<const float32x4_t *>(&vScale)[0]); + return vResult; +#else // XM_SSE_INTRINSICS_ + // Splat the int + __m128i vScale = _mm_set1_epi32(IntConstant); + // Convert to a float + XMVECTOR vResult = _mm_cvtepi32_ps(vScale); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat the scalar value (It's really a float) + vScale = _mm_set1_epi32(uScale); + // Multiply by the reciprocal (Perform a right shift by DivExponent) + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorSplatConstantInt(int32_t IntConstant) +{ + assert( IntConstant >= -16 && IntConstant <= 15 ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTORI32 V = { IntConstant, IntConstant, IntConstant, IntConstant }; + return V.v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + int32x4_t V = vdupq_n_s32( IntConstant ); + return reinterpret_cast<float32x4_t *>(&V)[0]; +#else // XM_SSE_INTRINSICS_ + __m128i V = _mm_set1_epi32( IntConstant ); + return reinterpret_cast<__m128 *>(&V)[0]; +#endif +} + +// Implemented for VMX128 intrinsics as #defines aboves +#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ 
|| _XM_ARM_NEON_INTRINSICS_ + +#include "DirectXMathConvert.inl" +#include "DirectXMathVector.inl" +#include "DirectXMathMatrix.inl" +#include "DirectXMathMisc.inl" + + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace DirectX + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl new file mode 100644 index 00000000..c8e39352 --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl @@ -0,0 +1,1962 @@ +//------------------------------------------------------------------------------------- +// DirectXMathConvert.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) +// For VMX128, these routines are all defines in the main header + +#pragma warning(push) +#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized + +inline XMVECTOR XMConvertVectorIntToFloat +( + FXMVECTOR VInt, + uint32_t DivExponent +) +{ + assert(DivExponent<32); +#if defined(_XM_NO_INTRINSICS_) + float fScale = 1.0f / (float)(1U << DivExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + int32_t iTemp = 
(int32_t)VInt.vector4_u32[ElementIndex]; + Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale; + } while (++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vcvtq_f32_s32( VInt ); + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + __n128 vScale = vdupq_n_u32( uScale ); + return vmulq_f32( vResult, vScale ); +#else // _XM_SSE_INTRINSICS_ + // Convert to floats + XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt)); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat the scalar value + __m128i vScale = _mm_set1_epi32(uScale); + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMConvertVectorFloatToInt +( + FXMVECTOR VFloat, + uint32_t MulExponent +) +{ + assert(MulExponent<32); +#if defined(_XM_NO_INTRINSICS_) + // Get the scalar factor. 
+ float fScale = (float)(1U << MulExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + int32_t iResult; + float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; + if (fTemp <= -(65536.0f*32768.0f)) { + iResult = (-0x7FFFFFFF)-1; + } else if (fTemp > (65536.0f*32768.0f)-128.0f) { + iResult = 0x7FFFFFFF; + } else { + iResult = (int32_t)fTemp; + } + Result.vector4_u32[ElementIndex] = (uint32_t)iResult; + } while (++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vdupq_n_f32((float)(1U << MulExponent)); + vResult = vmulq_f32(vResult,VFloat); + // In case of positive overflow, detect it + __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxInt); + // Float to int conversion + __n128 vResulti = vcvtq_s32_f32(vResult); + // If there was positive overflow, set to 0x7FFFFFFF + vResult = vandq_u32(vOverflow,g_XMAbsMask); + vOverflow = vbicq_u32(vResulti,vOverflow); + vOverflow = vorrq_u32(vOverflow,vResult); + return vOverflow; +#else // _XM_SSE_INTRINSICS_ + XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent)); + vResult = _mm_mul_ps(vResult,VFloat); + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(vResult); + // If there was positive overflow, set to 0x7FFFFFFF + vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + return vOverflow; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMConvertVectorUIntToFloat +( + FXMVECTOR VUInt, + uint32_t DivExponent +) +{ + assert(DivExponent<32); +#if defined(_XM_NO_INTRINSICS_) + float fScale = 1.0f / (float)(1U << DivExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale; + } while 
(++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vcvtq_f32_u32( VUInt ); + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + __n128 vScale = vdupq_n_u32( uScale ); + return vmulq_f32( vResult, vScale ); +#else // _XM_SSE_INTRINSICS_ + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(VUInt,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + // Convert DivExponent into 1.0f/(1<<DivExponent) + uint32_t uScale = 0x3F800000U - (DivExponent << 23); + // Splat + iMask = _mm_set1_epi32(uScale); + vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask)); + return vResult; +#endif +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMConvertVectorFloatToUInt +( + FXMVECTOR VFloat, + uint32_t MulExponent +) +{ + assert(MulExponent<32); +#if defined(_XM_NO_INTRINSICS_) + // Get the scalar factor. 
+ float fScale = (float)(1U << MulExponent); + uint32_t ElementIndex = 0; + XMVECTOR Result; + do { + uint32_t uResult; + float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; + if (fTemp <= 0.0f) { + uResult = 0; + } else if (fTemp >= (65536.0f*65536.0f)) { + uResult = 0xFFFFFFFFU; + } else { + uResult = (uint32_t)fTemp; + } + Result.vector4_u32[ElementIndex] = uResult; + } while (++ElementIndex<4); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vdupq_n_f32((float)(1U << MulExponent)); + vResult = vmulq_f32(vResult,VFloat); + // In case of overflow, detect it + __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxUInt); + // Float to int conversion + __n128 vResulti = vcvtq_u32_f32(vResult); + // If there was overflow, set to 0xFFFFFFFFU + vResult = vbicq_u32(vResulti,vOverflow); + vOverflow = vorrq_u32(vOverflow,vResult); + return vOverflow; +#else // _XM_SSE_INTRINSICS_ + XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent)); + vResult = _mm_mul_ps(vResult,VFloat); + // Clamp to >=0 + vResult = _mm_max_ps(vResult,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + return vResult; +#endif +} + +#pragma warning(pop) + +#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_ + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt(const uint32_t* pSource) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = *pSource; + V.vector4_u32[1] = 0; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 zero = vdupq_n_u32(0); + return vld1q_lane_u32( pSource, zero, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss( reinterpret_cast<const float*>(pSource) ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat(const float* pSource) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = *pSource; + V.vector4_f32[1] = 0.f; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 zero = vdupq_n_u32(0); + 
return vld1q_lane_f32( pSource, zero, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ss( pSource ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt2 +( + const uint32_t* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( pSource ); + __n64 zero = vdup_n_u32(0); + return vcombine_u32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); + return _mm_unpacklo_ps( x, y ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt2A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32_ex( pSource, 64 ); + __n64 zero = vdup_n_u32(0); + return vcombine_u32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat2 +( + const XMFLOAT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + 
XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) ); + __n64 zero = vdup_n_u32(0); + return vcombine_f32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + return _mm_unpacklo_ps( x, y ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat2A +( + const XMFLOAT2A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 ); + __n64 zero = vdup_n_u32(0); + return vcombine_f32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadSInt2 +( + const XMINT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); + __n64 v = vcvt_f32_s32( x ); + __n64 zero = vdup_n_u32(0); + return vcombine_s32( v, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = 
_mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 V = _mm_unpacklo_ps( x, y ); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadUInt2 +( + const XMUINT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); + __n64 v = vcvt_f32_u32( x ); + __n64 zero = vdup_n_u32(0); + return vcombine_u32( v, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 V = _mm_unpacklo_ps( x, y ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt3 +( + const uint32_t* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( pSource ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_u32( pSource+2, zero, 0 ); + return vcombine_u32( x, y ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); + __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt3A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra 
integer which is zero'd + __n128 V = vld1q_u32_ex( pSource, 128 ); + return vsetq_lane_u32( 0, V, 3 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra integer which is zero'd + __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); + V = _mm_and_si128( V, g_XMMask3 ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat3 +( + const XMFLOAT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 ); + return vcombine_f32( x, y ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + __m128 z = _mm_load_ss( &pSource->z ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat3A +( + const XMFLOAT3A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra float which is zero'd + __n128 V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); + return vsetq_lane_f32( 0, V, 3 ); +#elif defined(_XM_SSE_INTRINSICS_) 
+ // Reads an extra float which is zero'd + __m128 V = _mm_load_ps( &pSource->x ); + return _mm_and_ps( V, g_XMMask3 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadSInt3 +( + const XMINT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = 0.f; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 ); + __n128 v = vcombine_s32( x, y ); + return vcvtq_f32_s32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + __m128 V = _mm_movelh_ps( xy, z ); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadUInt3 +( + const XMUINT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 ); + __n128 v = vcombine_u32( x, y ); + return 
vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + __m128 V = _mm_movelh_ps( xy, z ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; + +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt4 +( + const uint32_t* pSource +) +{ + assert(pSource); + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_u32( pSource ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt4A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR 
V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_u32_ex( pSource, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat4 +( + const XMFLOAT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32( reinterpret_cast<const float*>(pSource) ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_loadu_ps( &pSource->x ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat4A +( + const XMFLOAT4A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps( &pSource->x ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadSInt4 +( + const XMINT4* pSource +) +{ + assert(pSource); +#if 
defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) ); + return vcvtq_f32_s32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + return _mm_cvtepi32_ps(V); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadUInt4 +( + const XMUINT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) ); + return vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat3x3 +( + const XMFLOAT3X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v0 = vld1q_f32( &pSource->m[0][0] ); + __n128 v1 = vld1q_f32( &pSource->m[1][1] ); + __n64 v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] ); + __n128 T = vextq_f32( v0, v1, 3 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T, g_XMMask3 ); + M.r[2] = vcombine_f32( vget_high_f32(v1), v2 ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = 
_mm_loadu_ps( &pSource->m[0][0] ); + __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] ); + __m128 V3 = _mm_load_ss( &pSource->m[2][2] ); + + __m128 T1 = _mm_unpackhi_ps( V1, Z ); + __m128 T2 = _mm_unpacklo_ps( V2, Z ); + __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) ); + __m128 T4 = _mm_movehl_ps( T2, T3 ); + __m128 T5 = _mm_movehl_ps( Z, T1 ); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps( V1, T1 ); + M.r[1] = _mm_add_ps( T4, T5 ); + M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x3 +( + const XMFLOAT4X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v0 = vld1q_f32( &pSource->m[0][0] ); + __n128 v1 = vld1q_f32( &pSource->m[1][1] ); + __n128 v2 = vld1q_f32( &pSource->m[2][2] ); + + __n128 T1 = vextq_f32( v0, v1, 3 ); + __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + __n128 T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, 
g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x3A +( + const XMFLOAT4X3A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + 
M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v0 = vld1q_f32_ex( &pSource->m[0][0], 128 ); + __n128 v1 = vld1q_f32_ex( &pSource->m[1][1], 128 ); + __n128 v2 = vld1q_f32_ex( &pSource->m[2][2], 128 ); + + __n128 T1 = vextq_f32( v0, v1, 3 ); + __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + __n128 T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x4 +( + const XMFLOAT4X4* 
pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) ); + M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) ); + M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) ); + M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps( &pSource->_11 ); + M.r[1] = _mm_loadu_ps( &pSource->_21 ); + M.r[2] = _mm_loadu_ps( &pSource->_31 ); + M.r[3] = _mm_loadu_ps( &pSource->_41 ); + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x4A +( + const XMFLOAT4X4A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + 
M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 ); + M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 ); + M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 ); + M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps( &pSource->_11 ); + M.r[1] = _mm_load_ps( &pSource->_21 ); + M.r[2] = _mm_load_ps( &pSource->_31 ); + M.r[3] = _mm_load_ps( &pSource->_41 ); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32( pDestination, V, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat +( + float* pDestination, + 
FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32( pDestination, V, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( pDestination, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + 
pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 v = vget_low_s32(V); + v = vcvt_s32_f32( v ); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = 
_mm_or_ps(vOverflow,vResult); + // Write two ints + XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 v = vget_low_u32(V); + v = vcvt_u32_f32( v ); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write two uints + XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt3 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); + vst1q_lane_u32( pDestination+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt3A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + 
assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); + vst1q_lane_u32( pDestination+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T1 ); + _mm_store_ss( &pDestination->z, T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL 
= vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( &pDestination->z, T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_s32_f32(V); + __n64 vL = vget_low_s32(v); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL ); + vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ 
+inline void XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + pDestination->z = (uint32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_u32_f32(V); + __n64 vL = vget_low_u32(v); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL ); + vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? + XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt4 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); 
+#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32( pDestination, V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt4A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32_ex( pDestination, V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast<float*>(pDestination), V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->x, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) +{ + 
assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->x, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreSInt4 +( + XMINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; + pDestination->w = (int32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_s32_f32(V); + vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + 
pDestination->z = (uint32_t)V.vector4_f32[2]; + pDestination->w = (uint32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_u32_f32(V); + vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? + XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat3x3 +( + XMFLOAT3X3* pDestination, + CXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + 
__n128 T1 = vextq_f32( M.r[0], M.r[1], 1 ); + __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2)); + vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss(&pDestination->m[2][2],vTemp3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4x3 +( + XMFLOAT4X3* pDestination, + CXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 ); + __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32( &pDestination->m[0][0], T2 ); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = 
vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32( &pDestination->m[1][1], T2 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32( &pDestination->m[2][2], T2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = M.r[0]; + XMVECTOR vTemp2 = M.r[1]; + XMVECTOR vTemp3 = M.r[2]; + XMVECTOR vTemp4 = M.r[3]; + XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0)); + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + _mm_storeu_ps(&pDestination->m[0][0],vTemp1); + _mm_storeu_ps(&pDestination->m[1][1],vTemp2x); + _mm_storeu_ps(&pDestination->m[2][2],vTemp3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4x3A +( + XMFLOAT4X3A* pDestination, + CXMMATRIX M +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 ); + __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); + vst1q_f32_ex( &pDestination->m[0][0], T2, 128 
); + + T1 = vextq_f32( M.r[1], M.r[1], 1 ); + T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); + vst1q_f32_ex( &pDestination->m[1][1], T2, 128 ); + + T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); + T2 = vextq_f32( T1, M.r[3], 3 ); + vst1q_f32_ex( &pDestination->m[2][2], T2, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + // x1,y1,z1,w1 + XMVECTOR vTemp1 = M.r[0]; + // x2,y2,z2,w2 + XMVECTOR vTemp2 = M.r[1]; + // x3,y3,z3,w3 + XMVECTOR vTemp3 = M.r[2]; + // x4,y4,z4,w4 + XMVECTOR vTemp4 = M.r[3]; + // z1,z1,x2,y2 + XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2)); + // y2,z2,x3,y3 (Final) + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); + // x1,y1,z1,x2 (Final) + vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0)); + // z3,z3,x4,x4 + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); + // z3,x4,y4,z4 (Final) + vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); + // Store in 3 operations + _mm_store_ps(&pDestination->m[0][0],vTemp1); + _mm_store_ps(&pDestination->m[1][1],vTemp2); + _mm_store_ps(&pDestination->m[2][2],vTemp3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4x4 +( + XMFLOAT4X4* pDestination, + CXMMATRIX M +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] 
= M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] ); + vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->_11, M.r[0] ); + _mm_storeu_ps( &pDestination->_21, M.r[1] ); + _mm_storeu_ps( &pDestination->_31, M.r[2] ); + _mm_storeu_ps( &pDestination->_41, M.r[3] ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4x4A +( + XMFLOAT4X4A* pDestination, + CXMMATRIX M +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + pDestination->m[0][0] = M.r[0].vector4_f32[0]; + pDestination->m[0][1] = M.r[0].vector4_f32[1]; + pDestination->m[0][2] = M.r[0].vector4_f32[2]; + pDestination->m[0][3] = M.r[0].vector4_f32[3]; + + pDestination->m[1][0] = M.r[1].vector4_f32[0]; + pDestination->m[1][1] = M.r[1].vector4_f32[1]; + pDestination->m[1][2] = M.r[1].vector4_f32[2]; + pDestination->m[1][3] = M.r[1].vector4_f32[3]; + + pDestination->m[2][0] = M.r[2].vector4_f32[0]; + pDestination->m[2][1] = M.r[2].vector4_f32[1]; + pDestination->m[2][2] = M.r[2].vector4_f32[2]; + pDestination->m[2][3] = M.r[2].vector4_f32[3]; + + pDestination->m[3][0] = M.r[3].vector4_f32[0]; + pDestination->m[3][1] = M.r[3].vector4_f32[1]; + pDestination->m[3][2] = M.r[3].vector4_f32[2]; + pDestination->m[3][3] = M.r[3].vector4_f32[3]; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( 
reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 ); + vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->_11, M.r[0] ); + _mm_store_ps( &pDestination->_21, M.r[1] ); + _mm_store_ps( &pDestination->_31, M.r[2] ); + _mm_store_ps( &pDestination->_41, M.r[3] ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMatrix.inl b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMatrix.inl new file mode 100644 index 00000000..d665d333 --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMatrix.inl @@ -0,0 +1,3414 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMatrix.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +/**************************************************************************** + * + * Matrix + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is NaN +inline bool XMMatrixIsNaN +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // NaN is 0x7F800001 through 0x7FFFFFFF inclusive + uTest -= 0x7F800001U; + if (uTest<0x007FFFFFU) { + break; // NaN found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = vmvnq_u32(vceqq_f32(vX, vX)); + vY = vmvnq_u32(vceqq_f32(vY, vY)); + vZ = vmvnq_u32(vceqq_f32(vZ, vZ)); + vW = vmvnq_u32(vceqq_f32(vW, vW)); + // Or all the results + vX = vorrq_u32(vX,vZ); + vY = vorrq_u32(vY,vW); + vX = vorrq_u32(vX,vY); + // If any tested true, return true + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vX), vget_high_u8(vX)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Load in registers + XMVECTOR vX = M.r[0]; + XMVECTOR vY = M.r[1]; + XMVECTOR vZ = M.r[2]; + XMVECTOR vW = M.r[3]; + // Test themselves to check for NaN + vX = 
_mm_cmpneq_ps(vX,vX); + vY = _mm_cmpneq_ps(vY,vY); + vZ = _mm_cmpneq_ps(vZ,vZ); + vW = _mm_cmpneq_ps(vW,vW); + // Or all the results + vX = _mm_or_ps(vX,vZ); + vY = _mm_or_ps(vY,vW); + vX = _mm_or_ps(vX,vY); + // If any tested true, return true + return (_mm_movemask_ps(vX)!=0); +#else +#endif +} + +//------------------------------------------------------------------------------ + +// Return true if any entry in the matrix is +/-INF +inline bool XMMatrixIsInfinite +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + size_t i = 16; + const uint32_t *pWork = (const uint32_t *)(&M.m[0][0]); + do { + // Fetch value into integer unit + uint32_t uTest = pWork[0]; + // Remove sign + uTest &= 0x7FFFFFFFU; + // INF is 0x7F800000 + if (uTest==0x7F800000U) { + break; // INF found + } + ++pWork; // Next entry + } while (--i); + return (i!=0); // i == 0 if nothing matched +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = vandq_u32(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = vandq_u32(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = vandq_u32(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = vandq_u32(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = vceqq_f32(vTemp1,g_XMInfinity); + vTemp2 = vceqq_f32(vTemp2,g_XMInfinity); + vTemp3 = vceqq_f32(vTemp3,g_XMInfinity); + vTemp4 = vceqq_f32(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = vorrq_u32(vTemp1,vTemp2); + vTemp3 = vorrq_u32(vTemp3,vTemp4); + vTemp1 = vorrq_u32(vTemp1,vTemp3); + // If any are infinity, the signs are true. 
+ int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return (r != 0); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bits + XMVECTOR vTemp1 = _mm_and_ps(M.r[0],g_XMAbsMask); + XMVECTOR vTemp2 = _mm_and_ps(M.r[1],g_XMAbsMask); + XMVECTOR vTemp3 = _mm_and_ps(M.r[2],g_XMAbsMask); + XMVECTOR vTemp4 = _mm_and_ps(M.r[3],g_XMAbsMask); + // Compare to infinity + vTemp1 = _mm_cmpeq_ps(vTemp1,g_XMInfinity); + vTemp2 = _mm_cmpeq_ps(vTemp2,g_XMInfinity); + vTemp3 = _mm_cmpeq_ps(vTemp3,g_XMInfinity); + vTemp4 = _mm_cmpeq_ps(vTemp4,g_XMInfinity); + // Or the answers together + vTemp1 = _mm_or_ps(vTemp1,vTemp2); + vTemp3 = _mm_or_ps(vTemp3,vTemp4); + vTemp1 = _mm_or_ps(vTemp1,vTemp3); + // If any are infinity, the signs are true. + return (_mm_movemask_ps(vTemp1)!=0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Return true if the XMMatrix is equal to identity +inline bool XMMatrixIsIdentity +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + // Use the integer pipeline to reduce branching to a minimum + const uint32_t *pWork = (const uint32_t*)(&M.m[0][0]); + // Convert 1.0f to zero and or them together + uint32_t uOne = pWork[0]^0x3F800000U; + // Or all the 0.0f entries together + uint32_t uZero = pWork[1]; + uZero |= pWork[2]; + uZero |= pWork[3]; + // 2nd row + uZero |= pWork[4]; + uOne |= pWork[5]^0x3F800000U; + uZero |= pWork[6]; + uZero |= pWork[7]; + // 3rd row + uZero |= pWork[8]; + uZero |= pWork[9]; + uOne |= pWork[10]^0x3F800000U; + uZero |= pWork[11]; + // 4th row + uZero |= pWork[12]; + uZero |= pWork[13]; + uZero |= pWork[14]; + uOne |= pWork[15]^0x3F800000U; + // If all zero entries are zero, the uZero==0 + uZero &= 0x7FFFFFFF; // Allow -0.0f + // If all 1.0f entries are 1.0f, then uOne==0 + uOne |= uZero; + return 
(uOne==0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vTemp1 = vceqq_f32(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = vceqq_f32(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = vceqq_f32(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = vceqq_f32(M.r[3],g_XMIdentityR3); + vTemp1 = vandq_u32(vTemp1,vTemp2); + vTemp3 = vandq_u32(vTemp3,vTemp4); + vTemp1 = vandq_u32(vTemp1,vTemp3); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + return ( r == 0xFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_cmpeq_ps(M.r[0],g_XMIdentityR0); + XMVECTOR vTemp2 = _mm_cmpeq_ps(M.r[1],g_XMIdentityR1); + XMVECTOR vTemp3 = _mm_cmpeq_ps(M.r[2],g_XMIdentityR2); + XMVECTOR vTemp4 = _mm_cmpeq_ps(M.r[3],g_XMIdentityR3); + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + vTemp3 = _mm_and_ps(vTemp3,vTemp4); + vTemp1 = _mm_and_ps(vTemp1,vTemp3); + return (_mm_movemask_ps(vTemp1)==0x0f); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Perform a 4x4 matrix multiply by a 4x4 matrix +inline XMMATRIX XMMatrixMultiply +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M1.m[0][0]; + float y = M1.m[0][1]; + float z = M1.m[0][2]; + float w = M1.m[0][3]; + // Perform the operation on the first row + mResult.m[0][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[0][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[0][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[0][3] = 
(M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + // Repeat for all the other rows + x = M1.m[1][0]; + y = M1.m[1][1]; + z = M1.m[1][2]; + w = M1.m[1][3]; + mResult.m[1][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[1][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[1][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[1][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[2][0]; + y = M1.m[2][1]; + z = M1.m[2][2]; + w = M1.m[2][3]; + mResult.m[2][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[2][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[2][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[2][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + x = M1.m[3][0]; + y = M1.m[3][1]; + z = M1.m[3][2]; + w = M1.m[3][3]; + mResult.m[3][0] = (M2.m[0][0]*x)+(M2.m[1][0]*y)+(M2.m[2][0]*z)+(M2.m[3][0]*w); + mResult.m[3][1] = (M2.m[0][1]*x)+(M2.m[1][1]*y)+(M2.m[2][1]*z)+(M2.m[3][1]*w); + mResult.m[3][2] = (M2.m[0][2]*x)+(M2.m[1][2]*y)+(M2.m[2][2]*z)+(M2.m[3][2]*w); + mResult.m[3][3] = (M2.m[0][3]*x)+(M2.m[1][3]*y)+(M2.m[2][3]*z)+(M2.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX mResult; + __n64 VL = vget_low_f32( M1.r[0] ); + __n64 VH = vget_high_f32( M1.r[0] ); + // Splat the component X,Y,Z then W + XMVECTOR vX = vdupq_lane_f32(VL, 0); + XMVECTOR vY = vdupq_lane_f32(VL, 1); + XMVECTOR vZ = vdupq_lane_f32(VH, 0); + XMVECTOR vW = vdupq_lane_f32(VH, 1); + // Perform the operation on the first row + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + mResult.r[0] = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vdupq_lane_f32(VL, 0); + vY = vdupq_lane_f32(VL, 1); 
+ vZ = vdupq_lane_f32(VH, 0); + vW = vdupq_lane_f32(VH, 1); + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + mResult.r[1] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX = vdupq_lane_f32(VL, 0); + vY = vdupq_lane_f32(VL, 1); + vZ = vdupq_lane_f32(VH, 0); + vW = vdupq_lane_f32(VH, 1); + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + mResult.r[2] = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vdupq_lane_f32(VL, 0); + vY = vdupq_lane_f32(VL, 1); + vZ = vdupq_lane_f32(VH, 0); + vW = vdupq_lane_f32(VH, 1); + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + mResult.r[3] = vaddq_f32( vZ, vW ); + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX mResult; + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[0] = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = 
_mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[1] = vX; + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[2] = vX; + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + mResult.r[3] = vX; + return mResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixMultiplyTranspose +( + CXMMATRIX M1, + CXMMATRIX M2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMMATRIX mResult; + // Cache the invariants in registers + float x = M2.m[0][0]; + float y = M2.m[1][0]; + float z = M2.m[2][0]; + float w = M2.m[3][0]; + // Perform the operation on the first row + mResult.m[0][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[0][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[0][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[0][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + // Repeat for all the other rows + x = M2.m[0][1]; + y = M2.m[1][1]; + z = M2.m[2][1]; + w = M2.m[3][1]; + mResult.m[1][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[1][1] = 
(M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[1][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[1][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][2]; + y = M2.m[1][2]; + z = M2.m[2][2]; + w = M2.m[3][2]; + mResult.m[2][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[2][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[2][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[2][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + x = M2.m[0][3]; + y = M2.m[1][3]; + z = M2.m[2][3]; + w = M2.m[3][3]; + mResult.m[3][0] = (M1.m[0][0]*x)+(M1.m[0][1]*y)+(M1.m[0][2]*z)+(M1.m[0][3]*w); + mResult.m[3][1] = (M1.m[1][0]*x)+(M1.m[1][1]*y)+(M1.m[1][2]*z)+(M1.m[1][3]*w); + mResult.m[3][2] = (M1.m[2][0]*x)+(M1.m[2][1]*y)+(M1.m[2][2]*z)+(M1.m[2][3]*w); + mResult.m[3][3] = (M1.m[3][0]*x)+(M1.m[3][1]*y)+(M1.m[3][2]*z)+(M1.m[3][3]*w); + return mResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32( M1.r[0] ); + __n64 VH = vget_high_f32( M1.r[0] ); + // Splat the component X,Y,Z then W + XMVECTOR vX = vdupq_lane_f32(VL, 0); + XMVECTOR vY = vdupq_lane_f32(VL, 1); + XMVECTOR vZ = vdupq_lane_f32(VH, 0); + XMVECTOR vW = vdupq_lane_f32(VH, 1); + // Perform the operation on the first row + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + __n128 r0 = vaddq_f32( vZ, vW ); + // Repeat for the other 3 rows + VL = vget_low_f32( M1.r[1] ); + VH = vget_high_f32( M1.r[1] ); + vX = vdupq_lane_f32(VL, 0); + vY = vdupq_lane_f32(VL, 1); + vZ = vdupq_lane_f32(VH, 0); + vW = vdupq_lane_f32(VH, 1); + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + __n128 r1 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[2] ); + VH = vget_high_f32( M1.r[2] ); + vX 
= vdupq_lane_f32(VL, 0); + vY = vdupq_lane_f32(VL, 1); + vZ = vdupq_lane_f32(VH, 0); + vW = vdupq_lane_f32(VH, 1); + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + __n128 r2 = vaddq_f32( vZ, vW ); + VL = vget_low_f32( M1.r[3] ); + VH = vget_high_f32( M1.r[3] ); + vX = vdupq_lane_f32(VL, 0); + vY = vdupq_lane_f32(VL, 1); + vZ = vdupq_lane_f32(VH, 0); + vW = vdupq_lane_f32(VH, 1); + vX = vmulq_f32(vX,M2.r[0]); + vY = vmulq_f32(vY,M2.r[1]); + vZ = vmlaq_f32(vX,vZ,M2.r[2]); + vW = vmlaq_f32(vY,vW,M2.r[3]); + __n128 r3 = vaddq_f32( vZ, vW ); + + // Transpose result + float32x4x2_t P0 = vzipq_f32( r0, r2 ); + float32x4x2_t P1 = vzipq_f32( r1, r3 ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Use vW to hold the original row + XMVECTOR vW = M1.r[0]; + // Splat the component X,Y,Z then W + XMVECTOR vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + // Perform the operation on the first row + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + // Perform a binary add to reduce cumulative errors + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r0 = vX; + // Repeat for the other 3 rows + vW = M1.r[1]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW 
= _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r1 = vX; + vW = M1.r[2]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r2 = vX; + vW = M1.r[3]; + vX = XM_PERMUTE_PS(vW,_MM_SHUFFLE(0,0,0,0)); + vY = XM_PERMUTE_PS(vW,_MM_SHUFFLE(1,1,1,1)); + vZ = XM_PERMUTE_PS(vW,_MM_SHUFFLE(2,2,2,2)); + vW = XM_PERMUTE_PS(vW,_MM_SHUFFLE(3,3,3,3)); + vX = _mm_mul_ps(vX,M2.r[0]); + vY = _mm_mul_ps(vY,M2.r[1]); + vZ = _mm_mul_ps(vZ,M2.r[2]); + vW = _mm_mul_ps(vW,M2.r[3]); + vX = _mm_add_ps(vX,vZ); + vY = _mm_add_ps(vY,vW); + vX = _mm_add_ps(vX,vY); + __m128 r3 = vX; + + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(r0,r1,_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(r2,r3,_MM_SHUFFLE(3,2,3,2)); + + XMMATRIX mResult; + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixTranspose +( + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + + // Original matrix: + // + // 
m00m01m02m03 + // m10m11m12m13 + // m20m21m22m23 + // m30m31m32m33 + + XMMATRIX P; + P.r[0] = XMVectorMergeXY(M.r[0], M.r[2]); // m00m20m01m21 + P.r[1] = XMVectorMergeXY(M.r[1], M.r[3]); // m10m30m11m31 + P.r[2] = XMVectorMergeZW(M.r[0], M.r[2]); // m02m22m03m23 + P.r[3] = XMVectorMergeZW(M.r[1], M.r[3]); // m12m32m13m33 + + XMMATRIX MT; + MT.r[0] = XMVectorMergeXY(P.r[0], P.r[1]); // m00m10m20m30 + MT.r[1] = XMVectorMergeZW(P.r[0], P.r[1]); // m01m11m21m31 + MT.r[2] = XMVectorMergeXY(P.r[2], P.r[3]); // m02m12m22m32 + MT.r[3] = XMVectorMergeZW(P.r[2], P.r[3]); // m03m13m23m33 + return MT; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float32x4x2_t P0 = vzipq_f32( M.r[0], M.r[2] ); + float32x4x2_t P1 = vzipq_f32( M.r[1], M.r[3] ); + + float32x4x2_t T0 = vzipq_f32( P0.val[0], P1.val[0] ); + float32x4x2_t T1 = vzipq_f32( P0.val[1], P1.val[1] ); + + XMMATRIX mResult; + mResult.r[0] = T0.val[0]; + mResult.r[1] = T0.val[1]; + mResult.r[2] = T1.val[0]; + mResult.r[3] = T1.val[1]; + return mResult; +#elif defined(_XM_SSE_INTRINSICS_) + // x.x,x.y,y.x,y.y + XMVECTOR vTemp1 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(1,0,1,0)); + // x.z,x.w,y.z,y.w + XMVECTOR vTemp3 = _mm_shuffle_ps(M.r[0],M.r[1],_MM_SHUFFLE(3,2,3,2)); + // z.x,z.y,w.x,w.y + XMVECTOR vTemp2 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(1,0,1,0)); + // z.z,z.w,w.z,w.w + XMVECTOR vTemp4 = _mm_shuffle_ps(M.r[2],M.r[3],_MM_SHUFFLE(3,2,3,2)); + XMMATRIX mResult; + + // x.x,y.x,z.x,w.x + mResult.r[0] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(2,0,2,0)); + // x.y,y.y,z.y,w.y + mResult.r[1] = _mm_shuffle_ps(vTemp1, vTemp2,_MM_SHUFFLE(3,1,3,1)); + // x.z,y.z,z.z,w.z + mResult.r[2] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(2,0,2,0)); + // x.w,y.w,z.w,w.w + mResult.r[3] = _mm_shuffle_ps(vTemp3, vTemp4,_MM_SHUFFLE(3,1,3,1)); + return mResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return the inverse and 
the determinant of a 4x4 matrix +_Use_decl_annotations_ +inline XMMATRIX XMMatrixInverse +( + XMVECTOR* pDeterminant, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMMATRIX MT = XMMatrixTranspose(M); + + XMVECTOR V0[4], V1[4]; + V0[0] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[2]); + V1[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[3]); + V0[1] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[0]); + V1[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[1]); + V0[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z>(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W>(MT.r[3], MT.r[1]); + + XMVECTOR D0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR D1 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR D2 = XMVectorMultiply(V0[2], V1[2]); + + V0[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[2]); + V1[0] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[3]); + V0[1] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W>(MT.r[0]); + V1[1] = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(MT.r[1]); + V0[2] = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_1W>(MT.r[2], MT.r[0]); + V1[2] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_0Z, XM_PERMUTE_1X, XM_PERMUTE_1Z>(MT.r[3], MT.r[1]); + + D0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], D0); + D1 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], D1); + D2 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], D2); + + V0[0] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(MT.r[1]); + V1[0] = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X>(D0, D2); + V0[1] = XMVectorSwizzle<XM_SWIZZLE_Z, 
XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(MT.r[0]); + V1[1] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_0Z>(D0, D2); + V0[2] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y>(MT.r[3]); + V1[2] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0W, XM_PERMUTE_0X>(D1, D2); + V0[3] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_X>(MT.r[2]); + V1[3] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_1W, XM_PERMUTE_0Y, XM_PERMUTE_0Z>(D1, D2); + + XMVECTOR C0 = XMVectorMultiply(V0[0], V1[0]); + XMVECTOR C2 = XMVectorMultiply(V0[1], V1[1]); + XMVECTOR C4 = XMVectorMultiply(V0[2], V1[2]); + XMVECTOR C6 = XMVectorMultiply(V0[3], V1[3]); + + V0[0] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(MT.r[1]); + V1[0] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(D0, D2); + V0[1] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(MT.r[0]); + V1[1] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0X>(D0, D2); + V0[2] = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y, XM_SWIZZLE_Z>(MT.r[3]); + V1[2] = XMVectorPermute<XM_PERMUTE_0W, XM_PERMUTE_0X, XM_PERMUTE_0Y, XM_PERMUTE_1Z>(D1, D2); + V0[3] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_Z, XM_SWIZZLE_W, XM_SWIZZLE_Y>(MT.r[2]); + V1[3] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(D1, D2); + + C0 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + C4 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + V0[0] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[1]); + V1[0] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1X, XM_PERMUTE_0Z>(D0, D2); + V0[1] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[0]); + V1[1] = 
XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1X>(D0, D2); + V0[2] = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_W, XM_SWIZZLE_X>(MT.r[3]); + V1[2] = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1W, XM_PERMUTE_1Z, XM_PERMUTE_0Z>(D1, D2); + V0[3] = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_W, XM_SWIZZLE_X, XM_SWIZZLE_Z>(MT.r[2]); + V1[3] = XMVectorPermute<XM_PERMUTE_1W, XM_PERMUTE_0X, XM_PERMUTE_0W, XM_PERMUTE_1Z>(D1, D2); + + XMVECTOR C1 = XMVectorNegativeMultiplySubtract(V0[0], V1[0], C0); + C0 = XMVectorMultiplyAdd(V0[0], V1[0], C0); + XMVECTOR C3 = XMVectorMultiplyAdd(V0[1], V1[1], C2); + C2 = XMVectorNegativeMultiplySubtract(V0[1], V1[1], C2); + XMVECTOR C5 = XMVectorNegativeMultiplySubtract(V0[2], V1[2], C4); + C4 = XMVectorMultiplyAdd(V0[2], V1[2], C4); + XMVECTOR C7 = XMVectorMultiplyAdd(V0[3], V1[3], C6); + C6 = XMVectorNegativeMultiplySubtract(V0[3], V1[3], C6); + + XMMATRIX R; + R.r[0] = XMVectorSelect(C0, C1, g_XMSelect0101.v); + R.r[1] = XMVectorSelect(C2, C3, g_XMSelect0101.v); + R.r[2] = XMVectorSelect(C4, C5, g_XMSelect0101.v); + R.r[3] = XMVectorSelect(C6, C7, g_XMSelect0101.v); + + XMVECTOR Determinant = XMVector4Dot(R.r[0], MT.r[0]); + + if (pDeterminant != NULL) + *pDeterminant = Determinant; + + XMVECTOR Reciprocal = XMVectorReciprocal(Determinant); + + XMMATRIX Result; + Result.r[0] = XMVectorMultiply(R.r[0], Reciprocal); + Result.r[1] = XMVectorMultiply(R.r[1], Reciprocal); + Result.r[2] = XMVectorMultiply(R.r[2], Reciprocal); + Result.r[3] = XMVectorMultiply(R.r[3], Reciprocal); + return Result; + +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX MT = XMMatrixTranspose(M); + XMVECTOR V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(1,1,0,0)); + XMVECTOR V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(3,2,3,2)); + XMVECTOR V02 = _mm_shuffle_ps(MT.r[2], MT.r[0],_MM_SHUFFLE(2,0,2,0)); + XMVECTOR V12 = 
_mm_shuffle_ps(MT.r[3], MT.r[1],_MM_SHUFFLE(3,1,3,1)); + + XMVECTOR D0 = _mm_mul_ps(V00,V10); + XMVECTOR D1 = _mm_mul_ps(V01,V11); + XMVECTOR D2 = _mm_mul_ps(V02,V12); + + V00 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(3,2,3,2)); + V10 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(1,1,0,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(3,2,3,2)); + V11 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(1,1,0,0)); + V02 = _mm_shuffle_ps(MT.r[2],MT.r[0],_MM_SHUFFLE(3,1,3,1)); + V12 = _mm_shuffle_ps(MT.r[3],MT.r[1],_MM_SHUFFLE(2,0,2,0)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + D0 = _mm_sub_ps(D0,V00); + D1 = _mm_sub_ps(D1,V01); + D2 = _mm_sub_ps(D2,V02); + // V11 = D0Y,D0W,D2Y,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,1,3,1)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(1,0,2,1)); + V10 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(0,3,0,2)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(0,1,0,2)); + V11 = _mm_shuffle_ps(V11,D0,_MM_SHUFFLE(2,1,2,1)); + // V13 = D1Y,D1W,D2W,D2W + XMVECTOR V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,3,3,1)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(1,0,2,1)); + V12 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(0,3,0,2)); + XMVECTOR V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(0,1,0,2)); + V13 = _mm_shuffle_ps(V13,D1,_MM_SHUFFLE(2,1,2,1)); + + XMVECTOR C0 = _mm_mul_ps(V00,V10); + XMVECTOR C2 = _mm_mul_ps(V01,V11); + XMVECTOR C4 = _mm_mul_ps(V02,V12); + XMVECTOR C6 = _mm_mul_ps(V03,V13); + + // V11 = D0X,D0Y,D2X,D2X + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(0,0,1,0)); + V00 = XM_PERMUTE_PS(MT.r[1], _MM_SHUFFLE(2,1,3,2)); + V10 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(2,1,0,3)); + V01 = XM_PERMUTE_PS(MT.r[0], _MM_SHUFFLE(1,3,2,3)); + V11 = _mm_shuffle_ps(D0,V11,_MM_SHUFFLE(0,2,1,2)); + // V13 = D1X,D1Y,D2Z,D2Z + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(2,2,1,0)); + V02 = XM_PERMUTE_PS(MT.r[3], _MM_SHUFFLE(2,1,3,2)); + V12 = _mm_shuffle_ps(D1,V13,_MM_SHUFFLE(2,1,0,3)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(1,3,2,3)); + V13 = 
_mm_shuffle_ps(D1,V13,_MM_SHUFFLE(0,2,1,2)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + C0 = _mm_sub_ps(C0,V00); + C2 = _mm_sub_ps(C2,V01); + C4 = _mm_sub_ps(C4,V02); + C6 = _mm_sub_ps(C6,V03); + + V00 = XM_PERMUTE_PS(MT.r[1],_MM_SHUFFLE(0,3,0,3)); + // V10 = D0Z,D0Z,D2X,D2Y + V10 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,2,2)); + V10 = XM_PERMUTE_PS(V10,_MM_SHUFFLE(0,2,3,0)); + V01 = XM_PERMUTE_PS(MT.r[0],_MM_SHUFFLE(2,0,3,1)); + // V11 = D0X,D0W,D2X,D2Y + V11 = _mm_shuffle_ps(D0,D2,_MM_SHUFFLE(1,0,3,0)); + V11 = XM_PERMUTE_PS(V11,_MM_SHUFFLE(2,1,0,3)); + V02 = XM_PERMUTE_PS(MT.r[3],_MM_SHUFFLE(0,3,0,3)); + // V12 = D1Z,D1Z,D2Z,D2W + V12 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,2,2)); + V12 = XM_PERMUTE_PS(V12,_MM_SHUFFLE(0,2,3,0)); + V03 = XM_PERMUTE_PS(MT.r[2],_MM_SHUFFLE(2,0,3,1)); + // V13 = D1X,D1W,D2Z,D2W + V13 = _mm_shuffle_ps(D1,D2,_MM_SHUFFLE(3,2,3,0)); + V13 = XM_PERMUTE_PS(V13,_MM_SHUFFLE(2,1,0,3)); + + V00 = _mm_mul_ps(V00,V10); + V01 = _mm_mul_ps(V01,V11); + V02 = _mm_mul_ps(V02,V12); + V03 = _mm_mul_ps(V03,V13); + XMVECTOR C1 = _mm_sub_ps(C0,V00); + C0 = _mm_add_ps(C0,V00); + XMVECTOR C3 = _mm_add_ps(C2,V01); + C2 = _mm_sub_ps(C2,V01); + XMVECTOR C5 = _mm_sub_ps(C4,V02); + C4 = _mm_add_ps(C4,V02); + XMVECTOR C7 = _mm_add_ps(C6,V03); + C6 = _mm_sub_ps(C6,V03); + + C0 = _mm_shuffle_ps(C0,C1,_MM_SHUFFLE(3,1,2,0)); + C2 = _mm_shuffle_ps(C2,C3,_MM_SHUFFLE(3,1,2,0)); + C4 = _mm_shuffle_ps(C4,C5,_MM_SHUFFLE(3,1,2,0)); + C6 = _mm_shuffle_ps(C6,C7,_MM_SHUFFLE(3,1,2,0)); + C0 = XM_PERMUTE_PS(C0,_MM_SHUFFLE(3,1,2,0)); + C2 = XM_PERMUTE_PS(C2,_MM_SHUFFLE(3,1,2,0)); + C4 = XM_PERMUTE_PS(C4,_MM_SHUFFLE(3,1,2,0)); + C6 = XM_PERMUTE_PS(C6,_MM_SHUFFLE(3,1,2,0)); + // Get the determinate + XMVECTOR vTemp = XMVector4Dot(C0,MT.r[0]); + if (pDeterminant != NULL) + *pDeterminant = vTemp; + vTemp = _mm_div_ps(g_XMOne,vTemp); + XMMATRIX mResult; + mResult.r[0] = _mm_mul_ps(C0,vTemp); + mResult.r[1] = 
_mm_mul_ps(C2,vTemp);
    // (continuation: closes the SSE path of the function begun above this chunk)
    mResult.r[2] = _mm_mul_ps(C4,vTemp);
    mResult.r[3] = _mm_mul_ps(C6,vTemp);
    return mResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Computes the determinant of the 4x4 matrix M by cofactor expansion along
// row 0, vectorized with swizzles of rows 1-3. The scalar result is returned
// via XMVector4Dot.
inline XMVECTOR XMMatrixDeterminant
(
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Alternating signs of the row-0 cofactors.
    static const XMVECTORF32 Sign = {1.0f, -1.0f, 1.0f, -1.0f};

    // P0..P2 accumulate the 2x2 sub-determinants of rows 2 and 3
    // (products of swizzled pairs, minus the transposed products below).
    XMVECTOR V0 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[2]);
    XMVECTOR V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[3]);
    XMVECTOR V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[2]);
    XMVECTOR V3 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[3]);
    XMVECTOR V4 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[2]);
    XMVECTOR V5 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[3]);

    XMVECTOR P0 = XMVectorMultiply(V0, V1);
    XMVECTOR P1 = XMVectorMultiply(V2, V3);
    XMVECTOR P2 = XMVectorMultiply(V4, V5);

    V0 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[2]);
    V1 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[3]);
    V2 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[2]);
    V3 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[3]);
    V4 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[2]);
    V5 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[3]);

    P0 = XMVectorNegativeMultiplySubtract(V0, V1, P0);
    P1 = XMVectorNegativeMultiplySubtract(V2, V3, P1);
    P2 = XMVectorNegativeMultiplySubtract(V4, V5, P2);

    // Combine the sub-determinants with row 1 to form the four cofactors,
    // then dot with the signed row 0.
    V0 = XMVectorSwizzle<XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_W, XM_SWIZZLE_Z>(M.r[1]);
    V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(M.r[1]);
    V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_X>(M.r[1]);

    XMVECTOR S = XMVectorMultiply(M.r[0], Sign.v);
    XMVECTOR R = XMVectorMultiply(V0, P0);
    R = XMVectorNegativeMultiplySubtract(V1, P1, R);
    R = XMVectorMultiplyAdd(V2, P2, R);

    return XMVector4Dot(S, R);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Ranks three values: after expansion, (a) indexes the largest of (x,y,z),
// (b) the middle, and (c) the smallest.
#define XM3RANKDECOMPOSE(a, b, c, x, y, z) \
    if((x) < (y))                   \
    {                               \
        if((y) < (z))               \
        {                           \
            (a) = 2;                \
            (b) = 1;                \
            (c) = 0;                \
        }                           \
        else                        \
        {                           \
            (a) = 1;                \
                                    \
            if((x) < (z))           \
            {                       \
                (b) = 2;            \
                (c) = 0;            \
            }                       \
            else                    \
            {                       \
                (b) = 0;            \
                (c) = 2;            \
            }                       \
        }                           \
    }                               \
    else                            \
    {                               \
        if((x) < (z))               \
        {                           \
            (a) = 2;                \
            (b) = 0;                \
            (c) = 1;                \
        }                           \
        else                        \
        {                           \
            (a) = 0;                \
                                    \
            if((y) < (z))           \
            {                       \
                (b) = 2;            \
                (c) = 1;            \
            }                       \
            else                    \
            {                       \
                (b) = 1;            \
                (c) = 2;            \
            }                       \
        }                           \
    }

#define XM3_DECOMP_EPSILON 0.0001f

// Factors M into outScale (per-axis scale), outRotQuat (rotation quaternion)
// and outTrans (translation, row 3). Returns false when M is not a
// scale/rotation/translation matrix within XM3_DECOMP_EPSILON (determinant of
// the normalized basis must be ~1 after handedness correction).
_Use_decl_annotations_
inline bool XMMatrixDecompose
(
    XMVECTOR *outScale,
    XMVECTOR *outRotQuat,
    XMVECTOR *outTrans,
    CXMMATRIX M
)
{
    static const XMVECTOR *pvCanonicalBasis[3] = {
        &g_XMIdentityR0.v,
        &g_XMIdentityR1.v,
        &g_XMIdentityR2.v
    };

    assert( outScale != NULL );
    assert( outRotQuat != NULL );
    assert( outTrans != NULL );

    // Get the translation
    outTrans[0] = M.r[3];

    // Work on a copy of the upper 3x3 basis; ppvBasis aliases matTemp's rows
    // so XM3RANKDECOMPOSE indices can address them indirectly.
    XMVECTOR *ppvBasis[3];
    XMMATRIX matTemp;
    ppvBasis[0] = &matTemp.r[0];
    ppvBasis[1] = &matTemp.r[1];
    ppvBasis[2] = &matTemp.r[2];

    matTemp.r[0] = M.r[0];
    matTemp.r[1] = M.r[1];
    matTemp.r[2] = M.r[2];
    matTemp.r[3] = g_XMIdentityR3.v;

    // The scale output is written through a float view: x/y/z = basis lengths.
    float *pfScales = (float *)outScale;

    size_t a, b, c;
    XMVectorGetXPtr(&pfScales[0],XMVector3Length(ppvBasis[0][0]));
    XMVectorGetXPtr(&pfScales[1],XMVector3Length(ppvBasis[1][0]));
    XMVectorGetXPtr(&pfScales[2],XMVector3Length(ppvBasis[2][0]));
    pfScales[3] = 0.f;

    XM3RANKDECOMPOSE(a, b, c, pfScales[0], pfScales[1], pfScales[2])

    // Rebuild degenerate (near-zero-scale) axes from the canonical basis /
    // cross products so the rotation extraction below stays well-conditioned.
    if(pfScales[a] < XM3_DECOMP_EPSILON)
    {
        ppvBasis[a][0] = pvCanonicalBasis[a][0];
    }
    ppvBasis[a][0] = XMVector3Normalize(ppvBasis[a][0]);

    if(pfScales[b] < XM3_DECOMP_EPSILON)
    {
        size_t aa, bb, cc;
        float fAbsX, fAbsY, fAbsZ;

        fAbsX = fabsf(XMVectorGetX(ppvBasis[a][0]));
        fAbsY = fabsf(XMVectorGetY(ppvBasis[a][0]));
        fAbsZ = fabsf(XMVectorGetZ(ppvBasis[a][0]));

        XM3RANKDECOMPOSE(aa, bb, cc, fAbsX, fAbsY, fAbsZ)

        // Cross against the canonical axis least aligned with axis 'a'.
        ppvBasis[b][0] = XMVector3Cross(ppvBasis[a][0],pvCanonicalBasis[cc][0]);
    }

    ppvBasis[b][0] = XMVector3Normalize(ppvBasis[b][0]);

    if(pfScales[c] < XM3_DECOMP_EPSILON)
    {
        ppvBasis[c][0] = XMVector3Cross(ppvBasis[a][0],ppvBasis[b][0]);
    }

    ppvBasis[c][0] = XMVector3Normalize(ppvBasis[c][0]);

    float fDet = XMVectorGetX(XMMatrixDeterminant(matTemp));

    // use Kramer's rule to check for handedness of coordinate system
    if(fDet < 0.0f)
    {
        // switch coordinate system by negating the scale and inverting the basis vector on the x-axis
        pfScales[a] = -pfScales[a];
        ppvBasis[a][0] = XMVectorNegate(ppvBasis[a][0]);

        fDet = -fDet;
    }

    // (det - 1)^2 must stay within tolerance for a pure SRT matrix.
    fDet -= 1.0f;
    fDet *= fDet;

    if(XM3_DECOMP_EPSILON < fDet)
    {
        // Non-SRT matrix encountered
        return false;
    }

    // generate the quaternion from the matrix
    outRotQuat[0] = XMQuaternionRotationMatrix(matTemp);
    return true;
}

#undef XM3_DECOMP_EPSILON
#undef XM3RANKDECOMPOSE

//------------------------------------------------------------------------------
// Transformation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Builds the 4x4 identity matrix.
inline XMMATRIX XMMatrixIdentity()
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMMATRIX M;
    M.r[0] = g_XMIdentityR0.v;
    M.r[1] = g_XMIdentityR1.v;
    M.r[2] = g_XMIdentityR2.v;
    M.r[3] = g_XMIdentityR3.v;
    return M;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a matrix from sixteen scalar elements, given in row-major order
// (m<row><column>).
inline XMMATRIX XMMatrixSet
(
    float m00, float m01, float m02, float m03,
    float m10, float m11, float m12, float m13,
    float m20, float m21, float m22, float m23,
    float m30, float m31, float m32, float m33
)
{
    XMMATRIX M;
#if defined(_XM_NO_INTRINSICS_)
    M.m[0][0] = m00; M.m[0][1] = m01; M.m[0][2] = m02; M.m[0][3] = m03;
    M.m[1][0] = m10; M.m[1][1] = m11; M.m[1][2] = m12; M.m[1][3] = m13;
    M.m[2][0] = m20; M.m[2][1] = m21; M.m[2][2] = m22; M.m[2][3] = m23;
    M.m[3][0] = m30; M.m[3][1] = m31; M.m[3][2] = m32; M.m[3][3] = m33;
#else
    M.r[0] = XMVectorSet(m00, m01, m02, m03);
    M.r[1] = XMVectorSet(m10, m11, m12, m13);
    M.r[2] = XMVectorSet(m20, m21, m22, m23);
    M.r[3] = XMVectorSet(m30, m31, m32, m33);
#endif
    return M;
}

//------------------------------------------------------------------------------

// Builds a translation matrix: identity with the offset stored in row 3.
inline XMMATRIX XMMatrixTranslation
(
    float OffsetX,
    float OffsetY,
    float OffsetZ
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;
    M.m[0][0] = 1.0f;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = 1.0f;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = 1.0f;
    M.m[2][3] = 0.0f;

    M.m[3][0] = OffsetX;
    M.m[3][1] = OffsetY;
    M.m[3][2] = OffsetZ;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMMATRIX M;
    M.r[0] = g_XMIdentityR0.v;
    M.r[1] = g_XMIdentityR1.v;
    M.r[2] = g_XMIdentityR2.v;
    M.r[3] = XMVectorSet(OffsetX, OffsetY, OffsetZ, 1.f );
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}


//------------------------------------------------------------------------------

// Builds a translation matrix from the x/y/z components of Offset
// (Offset.w is ignored; row 3's w is forced to 1).
inline XMMATRIX XMMatrixTranslationFromVector
(
    FXMVECTOR Offset
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;
    M.m[0][0] = 1.0f;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = 1.0f;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = 1.0f;
    M.m[2][3] = 0.0f;

    M.m[3][0] = Offset.vector4_f32[0];
    M.m[3][1] = Offset.vector4_f32[1];
    M.m[3][2] = Offset.vector4_f32[2];
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMMATRIX M;
    M.r[0] = g_XMIdentityR0.v;
    M.r[1] = g_XMIdentityR1.v;
    M.r[2] = g_XMIdentityR2.v;
    // Take xyz from Offset, w (=1) from the identity row.
    M.r[3] = XMVectorSelect( g_XMIdentityR3.v, Offset, g_XMSelect1110.v );
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a scaling matrix with ScaleX/ScaleY/ScaleZ on the diagonal.
inline XMMATRIX XMMatrixScaling
(
    float ScaleX,
    float ScaleY,
    float ScaleZ
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;
    M.m[0][0] = ScaleX;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = ScaleY;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = ScaleZ;
    M.m[2][3] = 0.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = 0.0f;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const XMVECTOR Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32( ScaleX, Zero, 0 );
    M.r[1] = vsetq_lane_f32( ScaleY, Zero, 1 );
    M.r[2] = vsetq_lane_f32( ScaleZ, Zero, 2 );
    M.r[3] = g_XMIdentityR3.v;
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    M.r[0] = _mm_set_ps( 0, 0, 0, ScaleX );
    M.r[1] = _mm_set_ps( 0, 0, ScaleY, 0 );
    M.r[2] = _mm_set_ps( 0, ScaleZ, 0, 0 );
    M.r[3] = g_XMIdentityR3.v;
    return M;
// NOTE(review): upstream DirectXMath closes this chain with
// "#else // _XM_VMX128_INTRINSICS_"; this guard looks like a port artifact.
// Behavior is unchanged while XM_NO_MISALIGNED_VECTOR_ACCESS is undefined —
// confirm against the PS3 build configuration.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

inline XMMATRIX
XMMatrixScalingFromVector
(
    FXMVECTOR Scale
)
{
// Builds a scaling matrix from the x/y/z components of Scale (Scale.w is
// masked off; row 3 comes from the identity).
#if defined(_XM_NO_INTRINSICS_)

    XMMATRIX M;
    M.m[0][0] = Scale.vector4_f32[0];
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = Scale.vector4_f32[1];
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = Scale.vector4_f32[2];
    M.m[2][3] = 0.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = 0.0f;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Isolate one lane of Scale per row via the single-lane masks.
    XMMATRIX M;
    M.r[0] = vandq_u32(Scale,g_XMMaskX);
    M.r[1] = vandq_u32(Scale,g_XMMaskY);
    M.r[2] = vandq_u32(Scale,g_XMMaskZ);
    M.r[3] = g_XMIdentityR3.v;
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    M.r[0] = _mm_and_ps(Scale,g_XMMaskX);
    M.r[1] = _mm_and_ps(Scale,g_XMMaskY);
    M.r[2] = _mm_and_ps(Scale,g_XMMaskZ);
    M.r[3] = g_XMIdentityR3.v;
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation matrix about the x-axis; Angle is in radians.
inline XMMATRIX XMMatrixRotationX
(
    float Angle
)
{
#if defined(_XM_NO_INTRINSICS_)

    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    XMMATRIX M;
    M.m[0][0] = 1.0f;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = fCosAngle;
    M.m[1][2] = fSinAngle;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = -fSinAngle;
    M.m[2][2] = fCosAngle;
    M.m[2][3] = 0.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = 0.0f;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    const XMVECTOR Zero = vdupq_n_f32(0);

    // Row 1 = (0, cos, sin, 0); row 2 = (0, -sin, cos, 0).
    XMVECTOR T1 = vsetq_lane_f32( fCosAngle, Zero, 1 );
    T1 = vsetq_lane_f32( fSinAngle, T1, 2 );

    XMVECTOR T2 = vsetq_lane_f32( -fSinAngle, Zero, 1 );
    T2 = vsetq_lane_f32( fCosAngle, T2, 2 );

    XMMATRIX M;
    M.r[0] = g_XMIdentityR0.v;
    M.r[1] = T1;
    M.r[2] = T2;
    M.r[3] = g_XMIdentityR3.v;
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    float SinAngle;
    float CosAngle;
    XMScalarSinCos(&SinAngle, &CosAngle, Angle);

    XMVECTOR vSin = _mm_set_ss(SinAngle);
    XMVECTOR vCos = _mm_set_ss(CosAngle);
    // x = 0,y = cos,z = sin, w = 0
    vCos = _mm_shuffle_ps(vCos,vSin,_MM_SHUFFLE(3,0,0,3));
    XMMATRIX M;
    M.r[0] = g_XMIdentityR0;
    M.r[1] = vCos;
    // x = 0,y = sin,z = cos, w = 0
    vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,1,2,0));
    // x = 0,y = -sin,z = cos, w = 0
    vCos = _mm_mul_ps(vCos,g_XMNegateY);
    M.r[2] = vCos;
    M.r[3] = g_XMIdentityR3;
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation matrix about the y-axis; Angle is in radians.
inline XMMATRIX XMMatrixRotationY
(
    float Angle
)
{
#if defined(_XM_NO_INTRINSICS_)

    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    XMMATRIX M;
    M.m[0][0] = fCosAngle;
    M.m[0][1] = 0.0f;
    M.m[0][2] = -fSinAngle;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = 1.0f;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = fSinAngle;
    M.m[2][1] = 0.0f;
    M.m[2][2] = fCosAngle;
    M.m[2][3] = 0.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = 0.0f;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    const XMVECTOR Zero = vdupq_n_f32(0);

    // Row 0 = (cos, 0, -sin, 0); row 2 = (sin, 0, cos, 0).
    XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 );
    T0 = vsetq_lane_f32( -fSinAngle, T0, 2 );

    XMVECTOR T2 = vsetq_lane_f32( fSinAngle, Zero, 0 );
    T2 = vsetq_lane_f32( fCosAngle, T2, 2 );

    XMMATRIX M;
    M.r[0] = T0;
    M.r[1] = g_XMIdentityR1.v;
    M.r[2] = T2;
    M.r[3] = g_XMIdentityR3.v;
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    float SinAngle;
    float CosAngle;
    XMScalarSinCos(&SinAngle, &CosAngle, Angle);

XMVECTOR vSin = _mm_set_ss(SinAngle);
    XMVECTOR vCos = _mm_set_ss(CosAngle);
    // x = sin,y = 0,z = cos, w = 0
    vSin = _mm_shuffle_ps(vSin,vCos,_MM_SHUFFLE(3,0,3,0));
    XMMATRIX M;
    M.r[2] = vSin;
    M.r[1] = g_XMIdentityR1;
    // x = cos,y = 0,z = sin, w = 0
    vSin = XM_PERMUTE_PS(vSin,_MM_SHUFFLE(3,0,1,2));
    // x = cos,y = 0,z = -sin, w = 0
    vSin = _mm_mul_ps(vSin,g_XMNegateZ);
    M.r[0] = vSin;
    M.r[3] = g_XMIdentityR3;
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation matrix about the z-axis; Angle is in radians.
inline XMMATRIX XMMatrixRotationZ
(
    float Angle
)
{
#if defined(_XM_NO_INTRINSICS_)

    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    XMMATRIX M;
    M.m[0][0] = fCosAngle;
    M.m[0][1] = fSinAngle;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = -fSinAngle;
    M.m[1][1] = fCosAngle;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = 1.0f;
    M.m[2][3] = 0.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = 0.0f;
    M.m[3][3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    const XMVECTOR Zero = vdupq_n_f32(0);

    // Row 0 = (cos, sin, 0, 0); row 1 = (-sin, cos, 0, 0).
    XMVECTOR T0 = vsetq_lane_f32( fCosAngle, Zero, 0 );
    T0 = vsetq_lane_f32( fSinAngle, T0, 1 );

    XMVECTOR T1 = vsetq_lane_f32( -fSinAngle, Zero, 0 );
    T1 = vsetq_lane_f32( fCosAngle, T1, 1 );

    XMMATRIX M;
    M.r[0] = T0;
    M.r[1] = T1;
    M.r[2] = g_XMIdentityR2.v;
    M.r[3] = g_XMIdentityR3.v;
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    float SinAngle;
    float CosAngle;
    XMScalarSinCos(&SinAngle, &CosAngle, Angle);

    XMVECTOR vSin = _mm_set_ss(SinAngle);
    XMVECTOR vCos = _mm_set_ss(CosAngle);
    // x = cos,y = sin,z = 0, w = 0
    vCos = _mm_unpacklo_ps(vCos,vSin);
    XMMATRIX M;
    M.r[0] = vCos;
    // x = sin,y = cos,z = 0, w = 0
    vCos = XM_PERMUTE_PS(vCos,_MM_SHUFFLE(3,2,0,1));
    // x = -sin,y = cos,z = 0, w = 0 (g_XMNegateX negates the x lane)
    vCos = _mm_mul_ps(vCos,g_XMNegateX);
    M.r[1] = vCos;
    M.r[2] = g_XMIdentityR2;
    M.r[3] = g_XMIdentityR3;
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation matrix from scalar pitch (x), yaw (y) and roll (z)
// angles in radians, by packing them and deferring to the vector overload.
inline XMMATRIX XMMatrixRotationRollPitchYaw
(
    float Pitch,
    float Yaw,
    float Roll
)
{
    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
    return XMMatrixRotationRollPitchYawFromVector(Angles);
}

//------------------------------------------------------------------------------

// Builds a rotation matrix from a pitch/yaw/roll angle vector by converting
// through a quaternion.
inline XMMATRIX XMMatrixRotationRollPitchYawFromVector
(
    FXMVECTOR Angles // <Pitch, Yaw, Roll, undefined>
)
{
    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
    return XMMatrixRotationQuaternion(Q);
}

//------------------------------------------------------------------------------

// Builds a rotation of Angle radians about NormalAxis, which must already be
// normalized (XMMatrixRotationAxis normalizes first and then calls this).
inline XMMATRIX XMMatrixRotationNormal
(
    FXMVECTOR NormalAxis,
    float Angle
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    // A = <sin, cos, 1-cos, 0>; C0/C1/C2 broadcast those three coefficients.
    XMVECTOR A = XMVectorSet(fSinAngle, fCosAngle, 1.0f - fCosAngle, 0.0f);

    XMVECTOR C2 = XMVectorSplatZ(A);
    XMVECTOR C1 = XMVectorSplatY(A);
    XMVECTOR C0 = XMVectorSplatX(A);

    XMVECTOR N0 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(NormalAxis);
    XMVECTOR N1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(NormalAxis);

    XMVECTOR V0 = XMVectorMultiply(C2, N0);
    V0 = XMVectorMultiply(V0, N1);

    XMVECTOR R0 = XMVectorMultiply(C2, NormalAxis);
    R0 = XMVectorMultiplyAdd(R0, NormalAxis, C1);

    XMVECTOR R1 = XMVectorMultiplyAdd(C0, NormalAxis, V0);
    XMVECTOR R2 = XMVectorNegativeMultiplySubtract(C0, NormalAxis, V0);

    V0 = XMVectorSelect(A, R0, g_XMSelect1110.v);
    XMVECTOR V1 =
XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_1Y, XM_PERMUTE_1Z, XM_PERMUTE_0X>(R1, R2);
    XMVECTOR V2 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_0Y, XM_PERMUTE_1X>(R1, R2);

    // Assemble the three rotation rows from the diagonal (R0) and
    // off-diagonal (R1/R2) terms.
    XMMATRIX M;
    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(V0, V1);
    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(V0, V1);
    M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(V0, V2);
    M.r[3] = g_XMIdentityR3.v;
    return M;

#elif defined(_XM_SSE_INTRINSICS_)
    float fSinAngle;
    float fCosAngle;
    XMScalarSinCos(&fSinAngle, &fCosAngle, Angle);

    // Rodrigues-style coefficients: C2 = 1-cos, C1 = cos, C0 = sin.
    XMVECTOR C2 = _mm_set_ps1(1.0f - fCosAngle);
    XMVECTOR C1 = _mm_set_ps1(fCosAngle);
    XMVECTOR C0 = _mm_set_ps1(fSinAngle);

    XMVECTOR N0 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,0,2,1));
    XMVECTOR N1 = XM_PERMUTE_PS(NormalAxis,_MM_SHUFFLE(3,1,0,2));

    XMVECTOR V0 = _mm_mul_ps(C2, N0);
    V0 = _mm_mul_ps(V0, N1);

    XMVECTOR R0 = _mm_mul_ps(C2, NormalAxis);
    R0 = _mm_mul_ps(R0, NormalAxis);
    R0 = _mm_add_ps(R0, C1);

    XMVECTOR R1 = _mm_mul_ps(C0, NormalAxis);
    R1 = _mm_add_ps(R1, V0);
    XMVECTOR R2 = _mm_mul_ps(C0, NormalAxis);
    R2 = _mm_sub_ps(V0,R2);

    V0 = _mm_and_ps(R0,g_XMMask3);
    XMVECTOR V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,1,2,0));
    V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,3,2,1));
    XMVECTOR V2 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(0,0,1,1));
    V2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,2,0));

    R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(1,0,3,0));
    R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,2,0));

    XMMATRIX M;
    M.r[0] = R2;

    R2 = _mm_shuffle_ps(V0,V1,_MM_SHUFFLE(3,2,3,1));
    R2 = XM_PERMUTE_PS(R2,_MM_SHUFFLE(1,3,0,2));
    M.r[1] = R2;

    V2 = _mm_shuffle_ps(V2,V0,_MM_SHUFFLE(3,2,1,0));
    M.r[2] = V2;
    M.r[3] = g_XMIdentityR3.v;
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation of Angle radians about an arbitrary (possibly
// unnormalized) Axis. Axis must be non-zero and finite.
inline
XMMATRIX XMMatrixRotationAxis
(
    FXMVECTOR Axis,
    float Angle
)
{
    assert(!XMVector3Equal(Axis, XMVectorZero()));
    assert(!XMVector3IsInfinite(Axis));

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Normalize once, then defer to the fast path that assumes unit length.
    XMVECTOR Normal = XMVector3Normalize(Axis);
    return XMMatrixRotationNormal(Normal, Angle);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation matrix from a quaternion (standard quaternion-to-matrix
// expansion, vectorized; row 3 is the identity row).
inline XMMATRIX XMMatrixRotationQuaternion
(
    FXMVECTOR Quaternion
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};

    // Q0 = 2q, Q1 = 2q*q — the squared terms of the expansion.
    XMVECTOR Q0 = XMVectorAdd(Quaternion, Quaternion);
    XMVECTOR Q1 = XMVectorMultiply(Quaternion, Q0);

    XMVECTOR V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_1W>(Q1, Constant1110.v);
    XMVECTOR V1 = XMVectorPermute<XM_PERMUTE_0Z, XM_PERMUTE_0Z, XM_PERMUTE_0Y, XM_PERMUTE_1W>(Q1, Constant1110.v);
    XMVECTOR R0 = XMVectorSubtract(Constant1110, V0);
    R0 = XMVectorSubtract(R0, V1);

    V0 = XMVectorSwizzle<XM_SWIZZLE_X, XM_SWIZZLE_X, XM_SWIZZLE_Y, XM_SWIZZLE_W>(Quaternion);
    V1 = XMVectorSwizzle<XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_W>(Q0);
    V0 = XMVectorMultiply(V0, V1);

    V1 = XMVectorSplatW(Quaternion);
    XMVECTOR V2 = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_X, XM_SWIZZLE_W>(Q0);
    V1 = XMVectorMultiply(V1, V2);

    // R1/R2 hold the off-diagonal sum/difference terms.
    XMVECTOR R1 = XMVectorAdd(V0, V1);
    XMVECTOR R2 = XMVectorSubtract(V0, V1);

    V0 = XMVectorPermute<XM_PERMUTE_0Y, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z>(R1, R2);
    V1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1Z, XM_PERMUTE_0X, XM_PERMUTE_1Z>(R1, R2);

    XMMATRIX M;
    M.r[0] = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0W>(R0, V0);
    M.r[1] = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_0Y, XM_PERMUTE_1W, XM_PERMUTE_0W>(R0, V0);
M.r[2] = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_1Y, XM_PERMUTE_0Z, XM_PERMUTE_0W>(R0, V1);
    M.r[3] = g_XMIdentityR3.v;
    return M;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Constant1110 = {1.0f, 1.0f, 1.0f, 0.0f};

    // Q0 = 2q, Q1 = 2q*q — the squared terms of the expansion.
    XMVECTOR Q0 = _mm_add_ps(Quaternion,Quaternion);
    XMVECTOR Q1 = _mm_mul_ps(Quaternion,Q0);

    XMVECTOR V0 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,0,0,1));
    V0 = _mm_and_ps(V0,g_XMMask3);
    XMVECTOR V1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(3,1,2,2));
    V1 = _mm_and_ps(V1,g_XMMask3);
    XMVECTOR R0 = _mm_sub_ps(Constant1110,V0);
    R0 = _mm_sub_ps(R0, V1);

    V0 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,1,0,0));
    V1 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,2,1,2));
    V0 = _mm_mul_ps(V0, V1);

    V1 = XM_PERMUTE_PS(Quaternion,_MM_SHUFFLE(3,3,3,3));
    XMVECTOR V2 = XM_PERMUTE_PS(Q0,_MM_SHUFFLE(3,0,2,1));
    V1 = _mm_mul_ps(V1, V2);

    // R1/R2 hold the off-diagonal sum/difference terms.
    XMVECTOR R1 = _mm_add_ps(V0, V1);
    XMVECTOR R2 = _mm_sub_ps(V0, V1);

    V0 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(1,0,2,1));
    V0 = XM_PERMUTE_PS(V0,_MM_SHUFFLE(1,3,2,0));
    V1 = _mm_shuffle_ps(R1,R2,_MM_SHUFFLE(2,2,0,0));
    V1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,0,2,0));

    Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(1,0,3,0));
    Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,2,0));

    XMMATRIX M;
    M.r[0] = Q1;

    Q1 = _mm_shuffle_ps(R0,V0,_MM_SHUFFLE(3,2,3,1));
    Q1 = XM_PERMUTE_PS(Q1,_MM_SHUFFLE(1,3,0,2));
    M.r[1] = Q1;

    Q1 = _mm_shuffle_ps(V1,R0,_MM_SHUFFLE(3,2,1,0));
    M.r[2] = Q1;
    M.r[3] = g_XMIdentityR3;
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a 2D affine transformation: scale about ScalingOrigin in a frame
// rotated by ScalingOrientation, rotate by Rotation about RotationOrigin,
// then translate. Only the x/y lanes of the vector arguments are used
// (the g_XMSelect1100 selects zero out z/w).
inline XMMATRIX XMMatrixTransformation2D
(
    FXMVECTOR ScalingOrigin,
    float ScalingOrientation,
    FXMVECTOR Scaling,
    FXMVECTOR RotationOrigin,
    float Rotation,
    GXMVECTOR Translation
)
{
    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
    //     MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    // Select(mask, V, mask) keeps V.xy and forces zw to 0 (the mask's own
    // zw lanes are zero).
    XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1100.v, ScalingOrigin, g_XMSelect1100.v);
    XMVECTOR NegScalingOrigin = XMVectorNegate(VScalingOrigin);

    XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
    XMMATRIX MScalingOrientation = XMMatrixRotationZ(ScalingOrientation);
    XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
    // Unused scale lanes (z/w) default to 1 so they do not affect the result.
    XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
    XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling);
    XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
    XMMATRIX MRotation = XMMatrixRotationZ(Rotation);
    XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v);

    // Translations are folded into row 3 directly instead of full matrix
    // multiplies where possible.
    XMMATRIX M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
    M = XMMatrixMultiply(M, MScaling);
    M = XMMatrixMultiply(M, MScalingOrientation);
    M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
    M = XMMatrixMultiply(M, MRotation);
    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
    M.r[3] = XMVectorAdd(M.r[3], VTranslation);

    return M;
}

//------------------------------------------------------------------------------

// Builds a 3D affine transformation with a separately-oriented scaling frame:
// scale about ScalingOrigin in the frame given by
// ScalingOrientationQuaternion, rotate by RotationQuaternion about
// RotationOrigin, then translate. W lanes of the vector arguments are
// ignored (masked by g_XMSelect1110).
inline XMMATRIX XMMatrixTransformation
(
    FXMVECTOR ScalingOrigin,
    FXMVECTOR ScalingOrientationQuaternion,
    FXMVECTOR Scaling,
    GXMVECTOR RotationOrigin,
    CXMVECTOR RotationQuaternion,
    CXMVECTOR Translation
)
{
    // M = Inverse(MScalingOrigin) * Transpose(MScalingOrientation) * MScaling * MScalingOrientation *
    //     MScalingOrigin * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    XMVECTOR VScalingOrigin = XMVectorSelect(g_XMSelect1110.v, ScalingOrigin, g_XMSelect1110.v);
    // NOTE(review): upstream DirectXMath negates VScalingOrigin (w already
    // zeroed) here. Negating ScalingOrigin directly is equivalent because
    // XMMatrixTranslationFromVector ignores the w lane — confirm intent.
    XMVECTOR NegScalingOrigin = XMVectorNegate(ScalingOrigin);

    XMMATRIX MScalingOriginI = XMMatrixTranslationFromVector(NegScalingOrigin);
    XMMATRIX MScalingOrientation =
XMMatrixRotationQuaternion(ScalingOrientationQuaternion);
    XMMATRIX MScalingOrientationT = XMMatrixTranspose(MScalingOrientation);
    XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling);
    XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin, g_XMSelect1110.v);
    XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
    XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation, g_XMSelect1110.v);

    // Compose the chain; pure translations are folded into row 3 directly.
    XMMATRIX M;
    M = XMMatrixMultiply(MScalingOriginI, MScalingOrientationT);
    M = XMMatrixMultiply(M, MScaling);
    M = XMMatrixMultiply(M, MScalingOrientation);
    M.r[3] = XMVectorAdd(M.r[3], VScalingOrigin);
    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
    M = XMMatrixMultiply(M, MRotation);
    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
    M.r[3] = XMVectorAdd(M.r[3], VTranslation);
    return M;
}

//------------------------------------------------------------------------------

// Builds a 2D affine transformation: scale, rotate by Rotation (radians)
// about RotationOrigin, then translate. Only x/y lanes of the vector
// arguments are used.
inline XMMATRIX XMMatrixAffineTransformation2D
(
    FXMVECTOR Scaling,
    FXMVECTOR RotationOrigin,
    float Rotation,
    FXMVECTOR Translation
)
{
    // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    // Unused scale lanes (z/w) default to 1; origin/translation zw forced to 0.
    XMVECTOR VScaling = XMVectorSelect(g_XMOne.v, Scaling, g_XMSelect1100.v);
    XMMATRIX MScaling = XMMatrixScalingFromVector(VScaling);
    XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1100.v, RotationOrigin, g_XMSelect1100.v);
    XMMATRIX MRotation = XMMatrixRotationZ(Rotation);
    XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1100.v, Translation,g_XMSelect1100.v);

    XMMATRIX M;
    M = MScaling;
    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
    M = XMMatrixMultiply(M, MRotation);
    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
    M.r[3] = XMVectorAdd(M.r[3], VTranslation);
    return M;
}

//------------------------------------------------------------------------------

// Builds a 3D affine transformation: scale, rotate by RotationQuaternion
// about RotationOrigin, then translate. W lanes of RotationOrigin and
// Translation are ignored.
inline XMMATRIX XMMatrixAffineTransformation
(
    FXMVECTOR Scaling,
    FXMVECTOR RotationOrigin,
    FXMVECTOR RotationQuaternion,
    GXMVECTOR Translation
)
{
    // M = MScaling * Inverse(MRotationOrigin) * MRotation * MRotationOrigin * MTranslation;

    XMMATRIX MScaling = XMMatrixScalingFromVector(Scaling);
    XMVECTOR VRotationOrigin = XMVectorSelect(g_XMSelect1110.v, RotationOrigin,g_XMSelect1110.v);
    XMMATRIX MRotation = XMMatrixRotationQuaternion(RotationQuaternion);
    XMVECTOR VTranslation = XMVectorSelect(g_XMSelect1110.v, Translation,g_XMSelect1110.v);

    XMMATRIX M;
    M = MScaling;
    M.r[3] = XMVectorSubtract(M.r[3], VRotationOrigin);
    M = XMMatrixMultiply(M, MRotation);
    M.r[3] = XMVectorAdd(M.r[3], VRotationOrigin);
    M.r[3] = XMVectorAdd(M.r[3], VTranslation);
    return M;
}

//------------------------------------------------------------------------------

// Builds a reflection matrix across ReflectionPlane (plane coefficients
// <a,b,c,d>; normalized internally). Each row is
// identity_row + plane_component * (-2 * normalized plane).
inline XMMATRIX XMMatrixReflect
(
    FXMVECTOR ReflectionPlane
)
{
    assert(!XMVector3Equal(ReflectionPlane, XMVectorZero()));
    assert(!XMPlaneIsInfinite(ReflectionPlane));

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    static const XMVECTORF32 NegativeTwo = {-2.0f, -2.0f, -2.0f, 0.0f};

    XMVECTOR P = XMPlaneNormalize(ReflectionPlane);
    XMVECTOR S = XMVectorMultiply(P, NegativeTwo);

    XMVECTOR A = XMVectorSplatX(P);
    XMVECTOR B = XMVectorSplatY(P);
    XMVECTOR C = XMVectorSplatZ(P);
    XMVECTOR D = XMVectorSplatW(P);

    XMMATRIX M;
    M.r[0] = XMVectorMultiplyAdd(A, S, g_XMIdentityR0.v);
    M.r[1] = XMVectorMultiplyAdd(B, S, g_XMIdentityR1.v);
    M.r[2] = XMVectorMultiplyAdd(C, S, g_XMIdentityR2.v);
    M.r[3] = XMVectorMultiplyAdd(D, S, g_XMIdentityR3.v);
    return M;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a matrix that projects geometry onto ShadowPlane as seen from
// LightPosition (rows built from the negated, normalized plane and the
// plane·light dot product rotated into each row's diagonal slot).
inline XMMATRIX XMMatrixShadow
(
    FXMVECTOR ShadowPlane,
    FXMVECTOR LightPosition
)
{
    static const XMVECTORU32 Select0001 = {XM_SELECT_0, XM_SELECT_0, XM_SELECT_0, XM_SELECT_1};

assert(!XMVector3Equal(ShadowPlane, XMVectorZero()));
    assert(!XMPlaneIsInfinite(ShadowPlane));

    XMVECTOR P = XMPlaneNormalize(ShadowPlane);
    XMVECTOR Dot = XMPlaneDot(P, LightPosition);
    P = XMVectorNegate(P);
    XMVECTOR D = XMVectorSplatW(P);
    XMVECTOR C = XMVectorSplatZ(P);
    XMVECTOR B = XMVectorSplatY(P);
    XMVECTOR A = XMVectorSplatX(P);
    // Keep only the w lane of the dot product; RotateLeft walks it through
    // the z, y and x diagonal positions for rows 2, 1 and 0.
    Dot = XMVectorSelect(Select0001.v, Dot, Select0001.v);

    XMMATRIX M;
    M.r[3] = XMVectorMultiplyAdd(D, LightPosition, Dot);
    Dot = XMVectorRotateLeft(Dot, 1);
    M.r[2] = XMVectorMultiplyAdd(C, LightPosition, Dot);
    Dot = XMVectorRotateLeft(Dot, 1);
    M.r[1] = XMVectorMultiplyAdd(B, LightPosition, Dot);
    Dot = XMVectorRotateLeft(Dot, 1);
    M.r[0] = XMVectorMultiplyAdd(A, LightPosition, Dot);
    return M;
}

//------------------------------------------------------------------------------
// View and projection initialization operations
//------------------------------------------------------------------------------

// Builds a left-handed view matrix looking from EyePosition toward
// FocusPosition, with UpDirection as the approximate up vector.
inline XMMATRIX XMMatrixLookAtLH
(
    FXMVECTOR EyePosition,
    FXMVECTOR FocusPosition,
    FXMVECTOR UpDirection
)
{
    XMVECTOR EyeDirection = XMVectorSubtract(FocusPosition, EyePosition);
    return XMMatrixLookToLH(EyePosition, EyeDirection, UpDirection);
}

//------------------------------------------------------------------------------

// Right-handed look-at: implemented by handing the reversed eye direction
// to the left-handed look-to builder.
inline XMMATRIX XMMatrixLookAtRH
(
    FXMVECTOR EyePosition,
    FXMVECTOR FocusPosition,
    FXMVECTOR UpDirection
)
{
    XMVECTOR NegEyeDirection = XMVectorSubtract(EyePosition, FocusPosition);
    return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection);
}

//------------------------------------------------------------------------------

// Builds a left-handed view matrix from a position and a view DIRECTION
// (rather than a focus point). Direction and up must be non-zero and finite.
inline XMMATRIX XMMatrixLookToLH
(
    FXMVECTOR EyePosition,
    FXMVECTOR EyeDirection,
    FXMVECTOR UpDirection
)
{
    assert(!XMVector3Equal(EyeDirection, XMVectorZero()));
    assert(!XMVector3IsInfinite(EyeDirection));
    assert(!XMVector3Equal(UpDirection, XMVectorZero()));
    assert(!XMVector3IsInfinite(UpDirection));

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Build the orthonormal camera basis: R2 = forward, R0 = right, R1 = up.
    XMVECTOR R2 = XMVector3Normalize(EyeDirection);

    XMVECTOR R0 = XMVector3Cross(UpDirection, R2);
    R0 = XMVector3Normalize(R0);

    XMVECTOR R1 = XMVector3Cross(R2, R0);

    XMVECTOR NegEyePosition = XMVectorNegate(EyePosition);

    // D0..D2 are the translation terms (basis · -eye).
    XMVECTOR D0 = XMVector3Dot(R0, NegEyePosition);
    XMVECTOR D1 = XMVector3Dot(R1, NegEyePosition);
    XMVECTOR D2 = XMVector3Dot(R2, NegEyePosition);

    // Rows hold basis.xyz with the dot product in w; the final transpose
    // produces the view matrix layout.
    XMMATRIX M;
    M.r[0] = XMVectorSelect(D0, R0, g_XMSelect1110.v);
    M.r[1] = XMVectorSelect(D1, R1, g_XMSelect1110.v);
    M.r[2] = XMVectorSelect(D2, R2, g_XMSelect1110.v);
    M.r[3] = g_XMIdentityR3.v;

    M = XMMatrixTranspose(M);

    return M;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Right-handed look-to: negates the direction and defers to the LH builder.
inline XMMATRIX XMMatrixLookToRH
(
    FXMVECTOR EyePosition,
    FXMVECTOR EyeDirection,
    FXMVECTOR UpDirection
)
{
    XMVECTOR NegEyeDirection = XMVectorNegate(EyeDirection);
    return XMMatrixLookToLH(EyePosition, NegEyeDirection, UpDirection);
}

//------------------------------------------------------------------------------

// Builds a left-handed perspective projection from near-plane view volume
// dimensions. The w-projection term is +1 (m[2][3]); depth maps NearZ..FarZ
// to 0..1 via fRange.
inline XMMATRIX XMMatrixPerspectiveLH
(
    float ViewWidth,
    float ViewHeight,
    float NearZ,
    float FarZ
)
{
    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (FarZ - NearZ);

    XMMATRIX M;
    M.m[0][0] = TwoNearZ / ViewWidth;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = TwoNearZ / ViewHeight;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = fRange;
    M.m[2][3] = 1.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1]
= 0.0f;
    M.m[3][2] = -fRange * NearZ;
    M.m[3][3] = 0.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (FarZ - NearZ);
    const XMVECTOR Zero = vdupq_n_f32(0);
    XMMATRIX M;
    M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 );
    M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 );
    // Row 2 = (0, 0, fRange, 1): splice fRange into the w=1 identity row.
    M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 );
    M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 );
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (FarZ - NearZ);
    // Note: This is recorded on the stack
    XMVECTOR rMem = {
        TwoNearZ / ViewWidth,
        TwoNearZ / ViewHeight,
        fRange,
        -fRange * NearZ
    };
    // Copy from memory to SSE register
    XMVECTOR vValues = rMem;
    XMVECTOR vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp,vValues);
    // TwoNearZ / ViewWidth,0,0,0
    M.r[0] = vTemp;
    // 0,TwoNearZ / ViewHeight,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp,g_XMMaskY);
    M.r[1] = vTemp;
    // x=fRange,y=-fRange * NearZ,0,1.0f
    vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2));
    // 0,0,fRange,1.0f
    vTemp = _mm_setzero_ps();
    vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0));
    M.r[2] = vTemp;
    // 0,0,-fRange * NearZ,0
    vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0));
    M.r[3] = vTemp;

    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a right-handed perspective projection from near-plane view volume
// dimensions. Differs from the LH version in the sign of the w-projection
// term (m[2][3] = -1) and fRange = FarZ / (NearZ - FarZ).
inline XMMATRIX XMMatrixPerspectiveRH
(
    float ViewWidth,
    float ViewHeight,
    float NearZ,
    float FarZ
)
{
    assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f));
    assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f));

#if defined(_XM_NO_INTRINSICS_)

    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (NearZ - FarZ);

    XMMATRIX M;
    M.m[0][0] = TwoNearZ / ViewWidth;
    M.m[0][1] = 0.0f;
    M.m[0][2] = 0.0f;
    M.m[0][3] = 0.0f;

    M.m[1][0] = 0.0f;
    M.m[1][1] = TwoNearZ / ViewHeight;
    M.m[1][2] = 0.0f;
    M.m[1][3] = 0.0f;

    M.m[2][0] = 0.0f;
    M.m[2][1] = 0.0f;
    M.m[2][2] = fRange;
    M.m[2][3] = -1.0f;

    M.m[3][0] = 0.0f;
    M.m[3][1] = 0.0f;
    M.m[3][2] = fRange * NearZ;
    M.m[3][3] = 0.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (NearZ - FarZ);
    const XMVECTOR Zero = vdupq_n_f32(0);

    XMMATRIX M;
    M.r[0] = vsetq_lane_f32( TwoNearZ / ViewWidth, Zero, 0 );
    M.r[1] = vsetq_lane_f32( TwoNearZ / ViewHeight, Zero, 1 );
    // Row 2 = (0, 0, fRange, -1): splice fRange into the w=-1 row.
    M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 );
    M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 );
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    XMMATRIX M;
    float TwoNearZ = NearZ + NearZ;
    float fRange = FarZ / (NearZ-FarZ);
    // Note: This is recorded on the stack
    XMVECTOR rMem = {
        TwoNearZ / ViewWidth,
        TwoNearZ / ViewHeight,
        fRange,
        fRange * NearZ
    };
    // Copy from memory to SSE register
    XMVECTOR vValues = rMem;
    XMVECTOR vTemp = _mm_setzero_ps();
    // Copy x only
    vTemp = _mm_move_ss(vTemp,vValues);
    // TwoNearZ / ViewWidth,0,0,0
    M.r[0] = vTemp;
    // 0,TwoNearZ / ViewHeight,0,0
    vTemp = vValues;
    vTemp = _mm_and_ps(vTemp,g_XMMaskY);
    M.r[1] = vTemp;
    // x=fRange,y=fRange * NearZ,0,-1.0f (rMem's z/w joined with -identity w)
    vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2));
    // 0,0,fRange,-1.0f
    vTemp = _mm_setzero_ps();
    vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0));
    M.r[2] = vTemp;
    // 0,0,fRange * NearZ,0
    vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0));
    M.r[3] = vTemp;
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a left-handed perspective projection from a vertical field of view
// (FovAngleY, radians) and a height/width aspect ratio.
inline XMMATRIX XMMatrixPerspectiveFovLH
(
    float FovAngleY,
    float AspectHByW,
    float
NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectHByW; + float fRange = FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + float Height = CosFov / SinFov; + float Width = Height / AspectHByW; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectHByW, + Height, + fRange, + -fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectHByW,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * 
NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixPerspectiveFovRH +( + float FovAngleY, + float AspectHByW, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(FovAngleY, 0.0f, 0.00001f * 2.0f)); + assert(!XMScalarNearEqual(AspectHByW, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + + float Height = CosFov / SinFov; + float Width = Height / AspectHByW; + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = Width; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = Height; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, &CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + float Height = CosFov / SinFov; + float Width = Height / AspectHByW; + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( Width, Zero, 0 ); + M.r[1] = vsetq_lane_f32( Height, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, g_XMNegIdentityR3.v, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + float SinFov; + float CosFov; + XMScalarSinCos(&SinFov, 
&CosFov, 0.5f * FovAngleY); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + float Height = CosFov / SinFov; + XMVECTOR rMem = { + Height / AspectHByW, + Height, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // CosFov / SinFov,0,0,0 + XMMATRIX M; + M.r[0] = vTemp; + // 0,Height / AspectHByW,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,-1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMNegIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,-1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixPerspectiveOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + 
M.m[2][3] = 1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + -fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( -(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + 1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixPerspectiveOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, 
+ float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = TwoNearZ * ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = TwoNearZ * ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = (ViewLeft + ViewRight) * ReciprocalWidth; + M.m[2][1] = (ViewTop + ViewBottom) * ReciprocalHeight; + M.m[2][2] = fRange; + M.m[2][3] = -1.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 0.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float TwoNearZ = NearZ + NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + + XMMATRIX M; + M.r[0] = vsetq_lane_f32( TwoNearZ * ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( TwoNearZ * ReciprocalHeight, Zero, 1 ); + M.r[2] = XMVectorSet((ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f); + M.r[3] = vsetq_lane_f32( fRange * NearZ, Zero, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float TwoNearZ = NearZ+NearZ; + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = FarZ / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + TwoNearZ*ReciprocalWidth, + TwoNearZ*ReciprocalHeight, + fRange * NearZ, + 0 + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); 
+ // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // TwoNearZ*ReciprocalWidth,0,0,0 + M.r[0] = vTemp; + // 0,TwoNearZ*ReciprocalHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // 0,0,fRange,1.0f + M.r[2] = XMVectorSet( (ViewLeft + ViewRight) * ReciprocalWidth, + (ViewTop + ViewBottom) * ReciprocalHeight, + fRange, + -1.0f ); + // 0,0,-fRange * NearZ,0.0f + vValues = _mm_and_ps(vValues,g_XMMaskZ); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixOrthographicLH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (FarZ-NearZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( -fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + -fRange * NearZ + }; + // Copy from 
memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=-fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,-fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixOrthographicRH +( + float ViewWidth, + float ViewHeight, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewWidth, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(ViewHeight, 0.0f, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = 2.0f / ViewWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = 2.0f / ViewHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = 0.0f; + M.m[3][1] = 0.0f; + M.m[3][2] = fRange * NearZ; + M.m[3][3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float fRange = 1.0f / (NearZ-FarZ); + + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( 2.0f / ViewWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( 2.0f / ViewHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = vsetq_lane_f32( fRange * NearZ, g_XMIdentityR3.v, 2 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float 
fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + 2.0f / ViewWidth, + 2.0f / ViewHeight, + fRange, + fRange * NearZ + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // 2.0f / ViewWidth,0,0,0 + M.r[0] = vTemp; + // 0,2.0f / ViewHeight,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + M.r[1] = vTemp; + // x=fRange,y=fRange * NearZ,0,1.0f + vTemp = _mm_setzero_ps(); + vValues = _mm_shuffle_ps(vValues,g_XMIdentityR3,_MM_SHUFFLE(3,2,3,2)); + // 0,0,fRange,0.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(2,0,0,0)); + M.r[2] = vTemp; + // 0,0,fRange * NearZ,1.0f + vTemp = _mm_shuffle_ps(vTemp,vValues,_MM_SHUFFLE(3,1,0,0)); + M.r[3] = vTemp; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixOrthographicOffCenterLH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.m[3][0] = -(ViewLeft + ViewRight) * ReciprocalWidth; + M.m[3][1] = -(ViewTop + ViewBottom) * ReciprocalHeight; + M.m[3][2] = -fRange * NearZ; + M.m[3][3] = 
1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + -fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (FarZ-NearZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + -NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMatrixOrthographicOffCenterRH +( + float ViewLeft, + float ViewRight, + float ViewBottom, + float ViewTop, + float NearZ, + 
float FarZ +) +{ + assert(!XMScalarNearEqual(ViewRight, ViewLeft, 0.00001f)); + assert(!XMScalarNearEqual(ViewTop, ViewBottom, 0.00001f)); + assert(!XMScalarNearEqual(FarZ, NearZ, 0.00001f)); + +#if defined(_XM_NO_INTRINSICS_) + + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + + XMMATRIX M; + M.m[0][0] = ReciprocalWidth + ReciprocalWidth; + M.m[0][1] = 0.0f; + M.m[0][2] = 0.0f; + M.m[0][3] = 0.0f; + + M.m[1][0] = 0.0f; + M.m[1][1] = ReciprocalHeight + ReciprocalHeight; + M.m[1][2] = 0.0f; + M.m[1][3] = 0.0f; + + M.m[2][0] = 0.0f; + M.m[2][1] = 0.0f; + M.m[2][2] = fRange; + M.m[2][3] = 0.0f; + + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float ReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float ReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + const XMVECTOR Zero = vdupq_n_f32(0); + XMMATRIX M; + M.r[0] = vsetq_lane_f32( ReciprocalWidth + ReciprocalWidth, Zero, 0 ); + M.r[1] = vsetq_lane_f32( ReciprocalHeight + ReciprocalHeight, Zero, 1 ); + M.r[2] = vsetq_lane_f32( fRange, Zero, 2 ); + M.r[3] = XMVectorSet(-(ViewLeft + ViewRight) * ReciprocalWidth, + -(ViewTop + ViewBottom) * ReciprocalHeight, + fRange * NearZ, + 1.0f); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + float fReciprocalWidth = 1.0f / (ViewRight - ViewLeft); + float fReciprocalHeight = 1.0f / (ViewTop - ViewBottom); + float fRange = 1.0f / (NearZ-FarZ); + // Note: This is recorded on the stack + XMVECTOR rMem = { + fReciprocalWidth, + fReciprocalHeight, + fRange, + 1.0f + }; + XMVECTOR rMem2 = { + -(ViewLeft + ViewRight), + -(ViewTop + ViewBottom), + NearZ, + 1.0f + }; + // Copy from memory to SSE register + XMVECTOR vValues = rMem; + XMVECTOR vTemp = _mm_setzero_ps(); + // Copy x only + 
vTemp = _mm_move_ss(vTemp,vValues); + // fReciprocalWidth*2,0,0,0 + vTemp = _mm_add_ss(vTemp,vTemp); + M.r[0] = vTemp; + // 0,fReciprocalHeight*2,0,0 + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskY); + vTemp = _mm_add_ps(vTemp,vTemp); + M.r[1] = vTemp; + // 0,0,fRange,0.0f + vTemp = vValues; + vTemp = _mm_and_ps(vTemp,g_XMMaskZ); + M.r[2] = vTemp; + // -(ViewLeft + ViewRight)*fReciprocalWidth,-(ViewTop + ViewBottom)*fReciprocalHeight,fRange*-NearZ,1.0f + vValues = _mm_mul_ps(vValues,rMem2); + M.r[3] = vValues; + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +/**************************************************************************** + * + * XMMATRIX operators and methods + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMMATRIX::XMMATRIX +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) +{ + r[0] = XMVectorSet(m00, m01, m02, m03); + r[1] = XMVectorSet(m10, m11, m12, m13); + r[2] = XMVectorSet(m20, m21, m22, m23); + r[3] = XMVectorSet(m30, m31, m32, m33); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX::XMMATRIX +( + const float* pArray +) +{ + assert( pArray != NULL ); + r[0] = XMLoadFloat4((const XMFLOAT4*)pArray); + r[1] = XMLoadFloat4((const XMFLOAT4*)(pArray + 4)); + r[2] = XMLoadFloat4((const XMFLOAT4*)(pArray + 8)); + r[3] = XMLoadFloat4((const XMFLOAT4*)(pArray + 12)); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator- () const +{ + XMMATRIX R; + R.r[0] = XMVectorNegate( r[0] ); + R.r[1] = XMVectorNegate( r[1] ); + R.r[2] = XMVectorNegate( r[2] ); + R.r[3] = XMVectorNegate( r[3] ); + return R; +} 
+ +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator+= (CXMMATRIX M) +{ + r[0] = XMVectorAdd( r[0], M.r[0] ); + r[1] = XMVectorAdd( r[1], M.r[1] ); + r[2] = XMVectorAdd( r[2], M.r[2] ); + r[3] = XMVectorAdd( r[3], M.r[3] ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator-= (CXMMATRIX M) +{ + r[0] = XMVectorSubtract( r[0], M.r[0] ); + r[1] = XMVectorSubtract( r[1], M.r[1] ); + r[2] = XMVectorSubtract( r[2], M.r[2] ); + r[3] = XMVectorSubtract( r[3], M.r[3] ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator*=(CXMMATRIX M) +{ + *this = XMMatrixMultiply( *this, M ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator*= (float S) +{ + r[0] = XMVectorScale( r[0], S ); + r[1] = XMVectorScale( r[1], S ); + r[2] = XMVectorScale( r[2], S ); + r[3] = XMVectorScale( r[3], S ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX& XMMATRIX::operator/= (float S) +{ + assert( S != 0.0f ); + float t = 1.0f / S; + r[0] = XMVectorScale( r[0], t ); + r[1] = XMVectorScale( r[1], t ); + r[2] = XMVectorScale( r[2], t ); + r[3] = XMVectorScale( r[3], t ); + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator+ (CXMMATRIX M) const +{ + XMMATRIX R; + R.r[0] = XMVectorAdd( r[0], M.r[0] ); + R.r[1] = XMVectorAdd( r[1], M.r[1] ); + R.r[2] = XMVectorAdd( r[2], M.r[2] ); + R.r[3] = XMVectorAdd( r[3], M.r[3] ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator- (CXMMATRIX M) const +{ + XMMATRIX R; + R.r[0] = 
XMVectorSubtract( r[0], M.r[0] ); + R.r[1] = XMVectorSubtract( r[1], M.r[1] ); + R.r[2] = XMVectorSubtract( r[2], M.r[2] ); + R.r[3] = XMVectorSubtract( r[3], M.r[3] ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator*(CXMMATRIX M) const +{ + return XMMatrixMultiply(*this, M); +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator* (float S) const +{ + XMMATRIX R; + R.r[0] = XMVectorScale( r[0], S ); + R.r[1] = XMVectorScale( r[1], S ); + R.r[2] = XMVectorScale( r[2], S ); + R.r[3] = XMVectorScale( r[3], S ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX XMMATRIX::operator/ (float S) const +{ + assert( S != 0.0f ); + XMMATRIX R; + float t = 1.0f / S; + R.r[0] = XMVectorScale( r[0], t ); + R.r[1] = XMVectorScale( r[1], t ); + R.r[2] = XMVectorScale( r[2], t ); + R.r[3] = XMVectorScale( r[3], t ); + return R; +} + +//------------------------------------------------------------------------------ + +inline XMMATRIX operator* +( + float S, + CXMMATRIX M +) +{ + XMMATRIX R; + R.r[0] = XMVectorScale( M.r[0], S ); + R.r[1] = XMVectorScale( M.r[1], S ); + R.r[2] = XMVectorScale( M.r[2], S ); + R.r[3] = XMVectorScale( M.r[3], S ); + return R; +} + +/**************************************************************************** + * + * XMFLOAT3X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMFLOAT3X3::XMFLOAT3X3 +( + float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22 +) +{ + m[0][0] = m00; + m[0][1] = m01; + m[0][2] = m02; + + m[1][0] = m10; + m[1][1] = m11; + m[1][2] = m12; + + m[2][0] = m20; + m[2][1] = m21; + m[2][2] = m22; +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT3X3::XMFLOAT3X3 +( + const float* pArray +) +{ + assert( pArray != NULL ); + for (size_t Row = 0; Row < 3; Row++) + { + for (size_t Column = 0; Column < 3; Column++) + { + m[Row][Column] = pArray[Row * 3 + Column]; + } + } +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT3X3& XMFLOAT3X3::operator= +( + const XMFLOAT3X3& Float3x3 +) +{ + _11 = Float3x3._11; + _12 = Float3x3._12; + _13 = Float3x3._13; + _21 = Float3x3._21; + _22 = Float3x3._22; + _23 = Float3x3._23; + _31 = Float3x3._31; + _32 = Float3x3._32; + _33 = Float3x3._33; + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X3 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X3::XMFLOAT4X3 +( + float m00, float m01, float m02, + float m10, float m11, float m12, + float m20, float m21, float m22, + float m30, float m31, float m32 +) +{ + m[0][0] = m00; + m[0][1] = m01; + m[0][2] = m02; + + m[1][0] = m10; + m[1][1] = m11; + m[1][2] = m12; + + m[2][0] = m20; + m[2][1] = m21; + m[2][2] = m22; + + m[3][0] = m30; + m[3][1] = m31; + m[3][2] = m32; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X3::XMFLOAT4X3 +( + const float* pArray +) +{ + assert( pArray != NULL ); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + + m[1][0] = pArray[3]; + m[1][1] = pArray[4]; + m[1][2] = pArray[5]; + + m[2][0] = pArray[6]; + m[2][1] = pArray[7]; + m[2][2] = pArray[8]; + + m[3][0] = pArray[9]; + m[3][1] = pArray[10]; + m[3][2] = pArray[11]; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X3& 
XMFLOAT4X3::operator= +( + const XMFLOAT4X3& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x3._33); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + XMStoreFloat4((XMFLOAT4*)&_22, V2); + XMStoreFloat4((XMFLOAT4*)&_33, V3); + + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X3A& XMFLOAT4X3A::operator= +( + const XMFLOAT4X3A& Float4x3 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._22); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x3._33); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_22, V2); + XMStoreFloat4A((XMFLOAT4A*)&_33, V3); + + return *this; +} + +/**************************************************************************** + * + * XMFLOAT4X4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X4::XMFLOAT4X4 +( + float m00, float m01, float m02, float m03, + float m10, float m11, float m12, float m13, + float m20, float m21, float m22, float m23, + float m30, float m31, float m32, float m33 +) +{ + m[0][0] = m00; + m[0][1] = m01; + m[0][2] = m02; + m[0][3] = m03; + + m[1][0] = m10; + m[1][1] = m11; + m[1][2] = m12; + m[1][3] = m13; + + m[2][0] = m20; + m[2][1] = m21; + m[2][2] = m22; + m[2][3] = m23; + + m[3][0] = m30; + m[3][1] = m31; + m[3][2] = m32; + m[3][3] = m33; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4X4::XMFLOAT4X4 +( + const float* pArray +) +{ + assert( pArray != NULL ); + + m[0][0] = pArray[0]; + m[0][1] = pArray[1]; + m[0][2] = pArray[2]; + m[0][3] = pArray[3]; + + m[1][0] = pArray[4]; + m[1][1] = 
pArray[5]; + m[1][2] = pArray[6]; + m[1][3] = pArray[7]; + + m[2][0] = pArray[8]; + m[2][1] = pArray[9]; + m[2][2] = pArray[10]; + m[2][3] = pArray[11]; + + m[3][0] = pArray[12]; + m[3][1] = pArray[13]; + m[3][2] = pArray[14]; + m[3][3] = pArray[15]; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X4& XMFLOAT4X4::operator= +( + const XMFLOAT4X4& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4((const XMFLOAT4*)&Float4x4._41); + + XMStoreFloat4((XMFLOAT4*)&_11, V1); + XMStoreFloat4((XMFLOAT4*)&_21, V2); + XMStoreFloat4((XMFLOAT4*)&_31, V3); + XMStoreFloat4((XMFLOAT4*)&_41, V4); + + return *this; +} + +//------------------------------------------------------------------------------ + +inline XMFLOAT4X4A& XMFLOAT4X4A::operator= +( + const XMFLOAT4X4A& Float4x4 +) +{ + XMVECTOR V1 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._11); + XMVECTOR V2 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._21); + XMVECTOR V3 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._31); + XMVECTOR V4 = XMLoadFloat4A((const XMFLOAT4A*)&Float4x4._41); + + XMStoreFloat4A((XMFLOAT4A*)&_11, V1); + XMStoreFloat4A((XMFLOAT4A*)&_21, V2); + XMStoreFloat4A((XMFLOAT4A*)&_31, V3); + XMStoreFloat4A((XMFLOAT4A*)&_41, V4); + + return *this; +} + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMisc.inl b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMisc.inl new file mode 100644 index 00000000..f3461e6c --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMisc.inl @@ -0,0 +1,2501 @@ +//------------------------------------------------------------------------------------- +// DirectXMathMisc.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT 
NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +/**************************************************************************** + * + * Quaternion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XMQuaternionEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4Equal(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XMQuaternionNotEqual +( + FXMVECTOR Q1, + FXMVECTOR Q2 +) +{ + return XMVector4NotEqual(Q1, Q2); +} + +//------------------------------------------------------------------------------ + +inline bool XMQuaternionIsNaN +( + FXMVECTOR Q +) +{ + return XMVector4IsNaN(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XMQuaternionIsInfinite +( + FXMVECTOR Q +) +{ + return XMVector4IsInfinite(Q); +} + +//------------------------------------------------------------------------------ + +inline bool XMQuaternionIsIdentity +( + FXMVECTOR Q +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + return XMVector4Equal(Q, g_XMIdentityR3.v); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + 
//------------------------------------------------------------------------------

// 4-component dot product of two quaternions, replicated into all lanes.
inline XMVECTOR XMQuaternionDot
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
)
{
    return XMVector4Dot(Q1, Q2);
}

//------------------------------------------------------------------------------

// Hamilton product of two quaternions.
// NOTE: returns Q2*Q1, i.e. the rotation Q1 applied first, then Q2.
inline XMVECTOR XMQuaternionMultiply
(
    FXMVECTOR Q1,
    FXMVECTOR Q2
)
{
    // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2)

    // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y),
    //   (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x),
    //   (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w),
    //   (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ]

#if defined(_XM_NO_INTRINSICS_)
    // Direct scalar evaluation of the expansion above.
    XMVECTOR Result = {
        (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]),
        (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]),
        (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]),
        (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) };
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Sign-control constants: each product term above needs a different
    // +/- pattern per lane, applied via multiply-accumulate below.
    static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f};
    static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f};
    static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f};

    __n64 Q2L = vget_low_f32(Q2);
    __n64 Q2H = vget_high_f32(Q2);

    // Splat each component of Q2 across a full register.
    __n128 Q2X = vdupq_lane_f32( Q2L, 0 );
    __n128 Q2Y = vdupq_lane_f32( Q2L, 1 );
    __n128 Q2Z = vdupq_lane_f32( Q2H, 0 );
    __n128 vResult = vdupq_lane_f32( Q2H, 1 );
    vResult = vmulq_f32(vResult,Q1);

    // Mul by Q1WZYX
    // vrev64q + vcombine swizzles Q1 into (w,z,y,x) order.
    __n128 vTemp = vrev64q_u32(Q1);
    vTemp = vcombine_f32( vget_high_f32(vTemp), vget_low_f32(vTemp) );
    Q2X = vmulq_f32(Q2X,vTemp);
    vResult = vmlaq_f32( vResult, Q2X, ControlWZYX );

    // Mul by Q1ZWXY
    vTemp = vrev64q_u32(vTemp);
    Q2Y = vmulq_f32(Q2Y,vTemp);
    vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);

    // Mul by Q1YXWZ
    vTemp = vrev64q_u32(vTemp);
    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
    Q2Z = vmulq_f32(Q2Z,vTemp);
    vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Sign-control constants matching the per-lane +/- pattern of each term.
    static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f};
    static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f};
    static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f};
    // Copy to SSE registers and use as few as possible for x86
    XMVECTOR Q2X = Q2;
    XMVECTOR Q2Y = Q2;
    XMVECTOR Q2Z = Q2;
    XMVECTOR vResult = Q2;
    // Splat with one instruction
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3));
    Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0));
    Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1));
    Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2));
    // Retire Q1 and perform Q1*Q2W
    vResult = _mm_mul_ps(vResult,Q1);
    XMVECTOR Q1Shuffle = Q1;
    // Shuffle the copies of Q1
    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
    // Mul by Q1WZYX
    Q2X = _mm_mul_ps(Q2X,Q1Shuffle);
    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
    // Flip the signs on y and z
    Q2X = _mm_mul_ps(Q2X,ControlWZYX);
    // Mul by Q1ZWXY
    Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle);
    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
    // Flip the signs on z and w
    Q2Y = _mm_mul_ps(Q2Y,ControlZWXY);
    // Mul by Q1YXWZ
    Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle);
    vResult = _mm_add_ps(vResult,Q2X);
    // Flip the signs on x and w
    Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ);
    Q2Y = _mm_add_ps(Q2Y,Q2Z);
    vResult = _mm_add_ps(vResult,Q2Y);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Squared length of the quaternion, replicated into all lanes.
inline XMVECTOR XMQuaternionLengthSq
(
    FXMVECTOR Q
)
{
    return XMVector4LengthSq(Q);
}

//------------------------------------------------------------------------------

// 1 / length of the quaternion, replicated into all lanes.
inline XMVECTOR XMQuaternionReciprocalLength
(
    FXMVECTOR Q
)
{
    return XMVector4ReciprocalLength(Q);
}

//------------------------------------------------------------------------------

// Length (magnitude) of the quaternion, replicated into all lanes.
inline XMVECTOR XMQuaternionLength
(
    FXMVECTOR Q
)
{
    return XMVector4Length(Q);
}

//------------------------------------------------------------------------------

// Fast, lower-precision normalization (uses the estimate path).
inline XMVECTOR XMQuaternionNormalizeEst
(
    FXMVECTOR Q
)
{
    return XMVector4NormalizeEst(Q);
}

//------------------------------------------------------------------------------

// Full-precision normalization to a unit quaternion.
inline XMVECTOR XMQuaternionNormalize
(
    FXMVECTOR Q
)
{
    return XMVector4Normalize(Q);
}

//------------------------------------------------------------------------------

// Conjugate: negate the vector part (x, y, z), keep w.
// For unit quaternions the conjugate equals the inverse.
inline XMVECTOR XMQuaternionConjugate
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result = {
        -Q.vector4_f32[0],
        -Q.vector4_f32[1],
        -Q.vector4_f32[2],
        Q.vector4_f32[3]
    };
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Multiply by (-1, -1, -1, 1) to flip only the vector part.
    static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f};
    return vmulq_f32(Q, NegativeOne3.v );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f};
    return _mm_mul_ps(Q,NegativeOne3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Inverse: conjugate(Q) / |Q|^2. Returns zero when |Q|^2 <= epsilon
// instead of dividing by (near-)zero.
inline XMVECTOR XMQuaternionInverse
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    const XMVECTOR  Zero = XMVectorZero();

    XMVECTOR L = XMVector4LengthSq(Q);
    XMVECTOR Conjugate = XMQuaternionConjugate(Q);

    // Mask of lanes where the squared length is (near) zero.
    XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v);

    XMVECTOR Result = XMVectorDivide(Conjugate, L);

    // Replace the (invalid) division result with zero for degenerate input.
    Result = XMVectorSelect(Result, Zero, Control);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Natural logarithm of a unit quaternion: (theta * v/|v|, 0) where
// Q = (sin(theta)*v, cos(theta)).
inline XMVECTOR XMQuaternionLn
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};

    XMVECTOR QW = XMVectorSplatW(Q);
    // Zero the w component, keeping the (x, y, z) vector part.
    XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v);

    // True where |w| < 1 - epsilon, i.e. the angle is large enough that the
    // theta/sin(theta) scale is numerically meaningful.
    XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v);

    XMVECTOR Theta = XMVectorACos(QW);
    XMVECTOR SinTheta = XMVectorSin(Theta);

    XMVECTOR S = XMVectorDivide(Theta,SinTheta);

    XMVECTOR Result = XMVectorMultiply(Q0, S);
    // For tiny angles fall back to the unscaled vector part (S ~ 1 there,
    // and dividing by sin(theta) ~ 0 would blow up).
    Result = XMVectorSelect(Q0, Result, ControlW);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Exponential of a pure quaternion (w ignored as input):
// exp(v) = (sin(|v|) * v/|v|, cos(|v|)).
inline XMVECTOR XMQuaternionExp
(
    FXMVECTOR Q
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR Theta = XMVector3Length(Q);

    XMVECTOR SinTheta, CosTheta;
    XMVectorSinCos(&SinTheta, &CosTheta, Theta);

    XMVECTOR S = XMVectorDivide(SinTheta, Theta);

    XMVECTOR Result = XMVectorMultiply(Q, S);

    // Near theta == 0 the scale sin(theta)/theta -> 1; use Q directly to
    // avoid the 0/0 division.
    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v);
    Result = XMVectorSelect(Result, Q, Control);

    // Write cos(theta) into the w lane.
    Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Spherical linear interpolation between Q0 (t = 0) and Q1 (t = 1).
// Scalar-t convenience wrapper over XMQuaternionSlerpV.
inline XMVECTOR XMQuaternionSlerp
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    float    t
)
{
    XMVECTOR T = XMVectorReplicate(t);
    return XMQuaternionSlerpV(Q0, Q1, T);
}

//------------------------------------------------------------------------------

// Spherical linear interpolation; T must have the interpolation factor
// replicated in all four lanes.
inline XMVECTOR XMQuaternionSlerpV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR T
)
{
    assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));

    // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};

    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);

    // Take the shorter arc: if the dot product is negative, negate one input
    // (tracked through Sign) so we interpolate the short way around.
    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
    XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control);

    CosOmega = XMVectorMultiply(CosOmega, Sign);

    // When the quaternions are nearly parallel, fall back to plain lerp
    // (Control selects between the two blends below).
    Control = XMVectorLess(CosOmega, OneMinusEpsilon);

    XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v);
    SinOmega = XMVectorSqrt(SinOmega);

    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);

    // Build V01 = (1 - t, t, 0, 0) with bit tricks: shift T left by two
    // lanes, then flip the sign of the first lane and add (1, 0, 0, 0).
    XMVECTOR SignMask = XMVectorSplatSignMask();
    XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2);
    SignMask = XMVectorShiftLeft(SignMask, Zero, 3);
    V01 = XMVectorXorInt(V01, SignMask);
    V01 = XMVectorAdd(g_XMIdentityR0.v, V01);

    XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega);

    // S0 = (sin((1-t)*Omega), sin(t*Omega), ...) / sin(Omega)
    XMVECTOR S0 = XMVectorMultiply(V01, Omega);
    S0 = XMVectorSin(S0);
    S0 = XMVectorMultiply(S0, InvSinOmega);

    S0 = XMVectorSelect(V01, S0, Control);

    XMVECTOR S1 = XMVectorSplatY(S0);
    S0 = XMVectorSplatX(S0);

    // Re-apply the short-arc sign to the Q1 weight.
    S1 = XMVectorMultiply(S1, Sign);

    XMVECTOR Result = XMVectorMultiply(Q0, S0);
    Result = XMVectorMultiplyAdd(Q1, S1, Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
    static const XMVECTORI32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000};
    static const XMVECTORI32 MaskXY = {0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000000};

    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);

    // Short-arc correction, as in the scalar path.
    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
    XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control);

    CosOmega = _mm_mul_ps(CosOmega, Sign);

    Control = XMVectorLess(CosOmega, OneMinusEpsilon);

    XMVECTOR SinOmega = _mm_mul_ps(CosOmega,CosOmega);
    SinOmega = _mm_sub_ps(g_XMOne,SinOmega);
    SinOmega = _mm_sqrt_ps(SinOmega);

    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);

    // Build V01 = (1 - t, t, 0, 0) via shuffle/mask/sign-flip.
    XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1));
    V01 = _mm_and_ps(V01,MaskXY);
    V01 = _mm_xor_ps(V01,SignMask2);
    V01 = _mm_add_ps(g_XMIdentityR0, V01);

    XMVECTOR S0 = _mm_mul_ps(V01, Omega);
    S0 = XMVectorSin(S0);
    S0 = _mm_div_ps(S0, SinOmega);

    S0 = XMVectorSelect(V01, S0, Control);

    XMVECTOR S1 = XMVectorSplatY(S0);
    S0 = XMVectorSplatX(S0);

    S1 = _mm_mul_ps(S1, Sign);
    XMVECTOR Result = _mm_mul_ps(Q0, S0);
    S1 = _mm_mul_ps(S1, Q1);
    Result = _mm_add_ps(Result,S1);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Spherical quadrangle (squad) interpolation across control points
// Q0..Q3. Scalar-t convenience wrapper over XMQuaternionSquadV.
inline XMVECTOR XMQuaternionSquad
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    GXMVECTOR Q3,
    float    t
)
{
    XMVECTOR T = XMVectorReplicate(t);
    return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T);
}

//------------------------------------------------------------------------------

// Squad interpolation: slerp(slerp(Q0,Q3,t), slerp(Q1,Q2,t), 2t(1-t)).
// T must have the interpolation factor replicated in all four lanes.
inline XMVECTOR XMQuaternionSquadV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    GXMVECTOR Q3,
    CXMVECTOR T
)
{
    assert( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) );

    XMVECTOR TP = T;
    const XMVECTOR Two = XMVectorSplatConstant(2, 0);

    XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T);
    XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T);

    // TP = 2 * t * (1 - t), computed as (t - t*t) * 2.
    TP = XMVectorNegativeMultiplySubtract(TP, TP, TP);
    TP = XMVectorMultiply(TP, Two);

    XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP);

    return Result;
}

//------------------------------------------------------------------------------
// Computes the intermediate control quaternions (A, B, C) needed by
// XMQuaternionSquad for the segment Q1 -> Q2 of the keys Q0..Q3.
_Use_decl_annotations_
inline void XMQuaternionSquadSetup
(
    XMVECTOR* pA,
    XMVECTOR* pB,
    XMVECTOR* pC,
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    GXMVECTOR Q3
)
{
    assert(pA);
    assert(pB);
    assert(pC);

    // For each neighboring pair, pick the sign of the second quaternion that
    // keeps it on the same hypersphere hemisphere as the first
    // (|Qa + Qb| < |Qa - Qb| means the pair straddles hemispheres).
    XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
    XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
    XMVECTOR SQ2 = XMVectorNegate(Q2);

    XMVECTOR Control1 = XMVectorLess(LS12, LD12);
    SQ2 = XMVectorSelect(Q2, SQ2, Control1);

    XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
    XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
    XMVECTOR SQ0 = XMVectorNegate(Q0);

    XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
    XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
    XMVECTOR SQ3 = XMVectorNegate(Q3);

    XMVECTOR Control0 = XMVectorLess(LS01, LD01);
    XMVECTOR Control2 = XMVectorLess(LS23, LD23);

    SQ0 = XMVectorSelect(Q0, SQ0, Control0);
    SQ3 = XMVectorSelect(Q3, SQ3, Control2);

    // Standard squad tangent construction:
    // A = Q1 * exp(-(ln(Q1^-1 SQ0) + ln(Q1^-1 SQ2)) / 4), similarly for B.
    XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
    XMVECTOR InvQ2 = XMQuaternionInverse(SQ2);

    XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
    XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
    XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
    XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));

    const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2);

    XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
    XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
    ExpQ02 = XMQuaternionExp(ExpQ02);
    ExpQ13 = XMQuaternionExp(ExpQ13);

    *pA = XMQuaternionMultiply(Q1, ExpQ02);
    *pB = XMQuaternionMultiply(SQ2, ExpQ13);
    *pC = SQ2;
}

//------------------------------------------------------------------------------

// Barycentric interpolation over the spherical triangle (Q0, Q1, Q2)
// with weights (1-f-g, f, g), built from two nested slerps.
inline XMVECTOR XMQuaternionBaryCentric
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    float    f,
    float    g
)
{
    float s = f + g;

    XMVECTOR Result;
    // Degenerate weights (f + g ~ 0) collapse to Q0.
    if ((s < 0.00001f) && (s > -0.00001f))
    {
        Result = Q0;
    }
    else
    {
        XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s);
        XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s);

        Result = XMQuaternionSlerp(Q01, Q02, g / s);
    }

    return Result;
}

//------------------------------------------------------------------------------

// Vector-weight variant of XMQuaternionBaryCentric; F and G must each have
// their scalar replicated in all four lanes.
inline XMVECTOR XMQuaternionBaryCentricV
(
    FXMVECTOR Q0,
    FXMVECTOR Q1,
    FXMVECTOR Q2,
    GXMVECTOR F,
    CXMVECTOR G
)
{
    assert( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
    assert( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );

    const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16);

    XMVECTOR S = XMVectorAdd(F, G);

    XMVECTOR Result;
    // |S| within epsilon of zero: degenerate weights collapse to Q0.
    if (XMVector4InBounds(S, Epsilon))
    {
        Result = Q0;
    }
    else
    {
        XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S);
        XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S);
        XMVECTOR GS = XMVectorReciprocal(S);
        GS = XMVectorMultiply(G, GS);

        Result = XMQuaternionSlerpV(Q01, Q02, GS);
    }

    return Result;
}

//------------------------------------------------------------------------------
// Transformation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Returns the identity quaternion (0, 0, 0, 1).
inline XMVECTOR XMQuaternionIdentity()
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    return g_XMIdentityR3.v;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from Euler angles (radians).
// Convention: roll about z first, then pitch about x, then yaw about y.
inline XMVECTOR XMQuaternionRotationRollPitchYaw
(
    float Pitch,
    float Yaw,
    float Roll
)
{
    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
    return Q;
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from a vector of Euler angles (radians).
inline XMVECTOR XMQuaternionRotationRollPitchYawFromVector
(
    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    static const XMVECTORF32  Sign = {1.0f, -1.0f, -1.0f, 1.0f};

    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);

    XMVECTOR SinAngles, CosAngles;
    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);

    // P/Y/R vectors hold sin of one half-angle in its own lane with cos of
    // the others, so the three single-axis quaternions multiply out lane-wise.
    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);

    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
    Q1 = XMVectorMultiply(Q1, Y1);
    Q0 = XMVectorMultiply(Q0, R0);
    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);

    return Q;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from a *unit-length* axis and an angle
// (radians): (sin(a/2) * axis, cos(a/2)). The axis is NOT normalized here.
inline XMVECTOR XMQuaternionRotationNormal
(
    FXMVECTOR NormalAxis,
    float    Angle
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Force w to 1 so the (sin, sin, sin, cos) scale lands correctly.
    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);

    float SinV, CosV;
    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);

    XMVECTOR Scale = XMVectorSet( SinV, SinV, SinV, CosV );
    return XMVectorMultiply(N, Scale);
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off w and set it to 1, then scale by (sin, sin, sin, cos).
    XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
    N = _mm_or_ps(N,g_XMIdentityR3);
    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
    XMVECTOR vSine;
    XMVECTOR vCosine;
    XMVectorSinCos(&vSine,&vCosine,Scale);
    Scale = _mm_and_ps(vSine,g_XMMask3);
    vCosine = _mm_and_ps(vCosine,g_XMMaskW);
    Scale = _mm_or_ps(Scale,vCosine);
    N = _mm_mul_ps(N,Scale);
    return N;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Builds a rotation quaternion from an arbitrary (non-zero, finite) axis
// and an angle in radians; normalizes the axis first.
inline XMVECTOR XMQuaternionRotationAxis
(
    FXMVECTOR Axis,
    float    Angle
)
{
    assert(!XMVector3Equal(Axis, XMVectorZero()));
    assert(!XMVector3IsInfinite(Axis));

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR Normal = XMVector3Normalize(Axis);
    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
    return Q;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Converts a rotation matrix to a quaternion using Shepperd's method:
// pick the largest of x^2, y^2, z^2, w^2 from the diagonal to avoid
// dividing by a small number, then recover the other components.
inline XMVECTOR XMQuaternionRotationMatrix
(
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTORF32 q;
    float r22 = M.m[2][2];
    if (r22 <= 0.f)  // x^2 + y^2 >= z^2 + w^2
    {
        float dif10 = M.m[1][1] - M.m[0][0];
        float omr22 = 1.f - r22;
        if (dif10 <= 0.f)  // x^2 >= y^2
        {
            float fourXSqr = omr22 - dif10;
            float inv4x = 0.5f / sqrtf(fourXSqr);
            q.f[0] = fourXSqr*inv4x;
            q.f[1] = (M.m[0][1] + M.m[1][0])*inv4x;
            q.f[2] = (M.m[0][2] + M.m[2][0])*inv4x;
            q.f[3] = (M.m[1][2] - M.m[2][1])*inv4x;
        }
        else  // y^2 >= x^2
        {
            float fourYSqr = omr22 + dif10;
            float inv4y = 0.5f / sqrtf(fourYSqr);
            q.f[0] = (M.m[0][1] + M.m[1][0])*inv4y;
            q.f[1] = fourYSqr*inv4y;
            q.f[2] = (M.m[1][2] + M.m[2][1])*inv4y;
            q.f[3] = (M.m[2][0] - M.m[0][2])*inv4y;
        }
    }
    else  // z^2 + w^2 >= x^2 + y^2
    {
        float sum10 = M.m[1][1] + M.m[0][0];
        float opr22 = 1.f + r22;
        if (sum10 <= 0.f)  // z^2 >= w^2
        {
            float fourZSqr = opr22 - sum10;
            float inv4z = 0.5f / sqrtf(fourZSqr);
            q.f[0] = (M.m[0][2] + M.m[2][0])*inv4z;
            q.f[1] = (M.m[1][2] + M.m[2][1])*inv4z;
            q.f[2] = fourZSqr*inv4z;
            q.f[3] = (M.m[0][1] - M.m[1][0])*inv4z;
        }
        else  // w^2 >= z^2
        {
            float fourWSqr = opr22 + sum10;
            float inv4w = 0.5f / sqrtf(fourWSqr);
            q.f[0] = (M.m[1][2] - M.m[2][1])*inv4w;
            q.f[1] = (M.m[2][0] - M.m[0][2])*inv4w;
            q.f[2] = (M.m[0][1] - M.m[1][0])*inv4w;
            q.f[3] = fourWSqr*inv4w;
        }
    }
    return q.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f};
    static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f};
    static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f};
    static const XMVECTORU32 Select0110 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 };
    static const XMVECTORU32 Select0010 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 };

    XMVECTOR r0 = M.r[0];
    XMVECTOR r1 = M.r[1];
    XMVECTOR r2 = M.r[2];

    XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
    XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
    XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0);

    // x^2 >= y^2 equivalent to r11 - r00 <= 0
    XMVECTOR r11mr00 = vsubq_f32(r11, r00);
    XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero);

    // z^2 >= w^2 equivalent to r11 + r00 <= 0
    XMVECTOR r11pr00 = vaddq_f32(r11, r00);
    XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero);

    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
    XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);

    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
    XMVECTOR t0 = vmulq_f32( XMPMMP, r00 );
    XMVECTOR x2y2z2w2 = vmlaq_f32( t0, XMMPMP, r11 );
    x2y2z2w2 = vmlaq_f32( x2y2z2w2, XMMMPP, r22 );
    x2y2z2w2 = vaddq_f32( x2y2z2w2, g_XMOne );

    // (r01, r02, r12, r11)
    t0 = vextq_f32(r0, r0, 1);
    XMVECTOR t1 = vextq_f32(r1, r1, 1);
    t0 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_low_f32( t1 ) ) );

    // (r10, r20, r21, r10)
    t1 = vextq_f32(r2, r2, 3);
    XMVECTOR r10 = vdupq_lane_f32( vget_low_f32(r1), 0 );
    t1 = vbslq_f32( Select0110, t1, r10 );

    // (4*x*y, 4*x*z, 4*y*z, unused)
    XMVECTOR xyxzyz = vaddq_f32(t0, t1);

    // (r21, r20, r10, r10)
    t0 = vcombine_f32( vrev64_f32( vget_low_f32(r2) ), vget_low_f32(r10) );

    // (r12, r02, r01, r12)
    XMVECTOR t2 = vcombine_f32( vrev64_f32( vget_high_f32(r0) ), vrev64_f32( vget_low_f32(r0) ) );
    XMVECTOR t3 = vdupq_lane_f32( vget_high_f32(r1), 0 );
    t1 = vbslq_f32( Select0110, t2, t3 );

    // (4*x*w, 4*y*w, 4*z*w, unused)
    XMVECTOR xwywzw = vsubq_f32(t0, t1);
    xwywzw = vmulq_f32(XMMPMP, xwywzw);

    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
    t0 = vextq_f32( xyxzyz, xyxzyz, 3 );
    t1 = vbslq_f32( Select0110, t0, x2y2z2w2 );
    t2 = vdupq_lane_f32( vget_low_f32(xwywzw), 0 );
    XMVECTOR tensor0 = vbslq_f32( g_XMSelect1110, t1, t2 );

    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
    t0 = vbslq_f32( g_XMSelect1011, xyxzyz, x2y2z2w2 );
    t1 = vdupq_lane_f32( vget_low_f32(xwywzw), 1 );
    XMVECTOR tensor1 = vbslq_f32( g_XMSelect1110, t0, t1 );

    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
    t0 = vextq_f32(xyxzyz, xyxzyz, 1);
    t1 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_high_f32(xwywzw) ) );
    XMVECTOR tensor2 = vbslq_f32( Select0010, x2y2z2w2, t1 );

    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
    XMVECTOR tensor3 = vbslq_f32( g_XMSelect1110, xwywzw, x2y2z2w2 );

    // Select the row of the tensor-product matrix that has the largest
    // magnitude.
    t0 = vbslq_f32( x2gey2, tensor0, tensor1 );
    t1 = vbslq_f32( z2gew2, tensor2, tensor3 );
    t2 = vbslq_f32( x2py2gez2pw2, t0, t1 );

    // Normalize the row.  No division by zero is possible because the
    // quaternion is unit-length (and the row is a nonzero multiple of
    // the quaternion).
    t0 = XMVector4Length(t2);
    return XMVectorDivide(t2, t0);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f};
    static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f};
    static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f};

    XMVECTOR r0 = M.r[0];  // (r00, r01, r02, 0)
    XMVECTOR r1 = M.r[1];  // (r10, r11, r12, 0)
    XMVECTOR r2 = M.r[2];  // (r20, r21, r22, 0)

    // (r00, r00, r00, r00)
    XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0));
    // (r11, r11, r11, r11)
    XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1));
    // (r22, r22, r22, r22)
    XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2));

    // x^2 >= y^2 equivalent to r11 - r00 <= 0
    // (r11 - r00, r11 - r00, r11 - r00, r11 - r00)
    XMVECTOR r11mr00 = _mm_sub_ps(r11, r00);
    XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero);

    // z^2 >= w^2 equivalent to r11 + r00 <= 0
    // (r11 + r00, r11 + r00, r11 + r00, r11 + r00)
    XMVECTOR r11pr00 = _mm_add_ps(r11, r00);
    XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero);

    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
    XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero);

    // (+r00, -r00, -r00, +r00)
    XMVECTOR t0 = _mm_mul_ps(XMPMMP, r00);

    // (-r11, +r11, -r11, +r11)
    XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11);

    // (-r22, -r22, +r22, +r22)
    XMVECTOR t2 = _mm_mul_ps(XMMMPP, r22);

    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
    XMVECTOR x2y2z2w2 = _mm_add_ps(t0, t1);
    x2y2z2w2 = _mm_add_ps(t2, x2y2z2w2);
    x2y2z2w2 = _mm_add_ps(x2y2z2w2, g_XMOne);

    // (r01, r02, r12, r11)
    t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1,2,2,1));
    // (r10, r10, r20, r21)
    t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0));
    // (r10, r20, r21, r10)
    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
    // (4*x*y, 4*x*z, 4*y*z, unused)
    XMVECTOR xyxzyz = _mm_add_ps(t0, t1);

    // (r21, r20, r10, r10)
    t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0,0,0,1));
    // (r12, r12, r02, r01)
    t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2));
    // (r12, r02, r01, r12)
    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
    // (4*x*w, 4*y*w, 4*z*w, unused)
    XMVECTOR xwywzw = _mm_sub_ps(t0, t1);
    xwywzw = _mm_mul_ps(XMMPMP, xwywzw);

    // (4*x^2, 4*y^2, 4*x*y, unused)
    t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0,0,1,0));
    // (4*z^2, 4*w^2, 4*z*w, unused)
    t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0,2,3,2));
    // (4*x*z, 4*y*z, 4*x*w, 4*y*w)
    t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1,0,2,1));

    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
    XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0));
    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
    XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,1,2));
    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
    XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,0,1,0));
    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
    XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1,2,3,2));

    // Select the row of the tensor-product matrix that has the largest
    // magnitude.
    t0 = _mm_and_ps(x2gey2, tensor0);
    t1 = _mm_andnot_ps(x2gey2, tensor1);
    t0 = _mm_or_ps(t0, t1);
    t1 = _mm_and_ps(z2gew2, tensor2);
    t2 = _mm_andnot_ps(z2gew2, tensor3);
    t1 = _mm_or_ps(t1, t2);
    t0 = _mm_and_ps(x2py2gez2pw2, t0);
    t1 = _mm_andnot_ps(x2py2gez2pw2, t1);
    t2 = _mm_or_ps(t0, t1);

    // Normalize the row.  No division by zero is possible because the
    // quaternion is unit-length (and the row is a nonzero multiple of
    // the quaternion).
    t0 = XMVector4Length(t2);
    return _mm_div_ps(t2, t0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Conversion operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Extracts the rotation axis and angle from a quaternion.
// NOTE: the axis is returned unnormalized (it is the raw quaternion).
_Use_decl_annotations_
inline void XMQuaternionToAxisAngle
(
    XMVECTOR* pAxis,
    float*   pAngle,
    FXMVECTOR Q
)
{
    assert(pAxis);
    assert(pAngle);

    *pAxis = Q;

    // w = cos(angle/2)  =>  angle = 2*acos(w).
    *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q));
}

/****************************************************************************
 *
 * Plane
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Exact component-wise equality of two planes (a, b, c, d).
inline bool XMPlaneEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2
)
{
    return XMVector4Equal(P1, P2);
}

//------------------------------------------------------------------------------

// Approximate equality: both planes are normalized first so that planes
// differing only by a scale factor compare as near-equal.
inline bool XMPlaneNearEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2,
    FXMVECTOR Epsilon
)
{
    XMVECTOR NP1 = XMPlaneNormalize(P1);
    XMVECTOR NP2 = XMPlaneNormalize(P2);
    return XMVector4NearEqual(NP1, NP2, Epsilon);
}

//------------------------------------------------------------------------------

// True if any component of the two planes differs.
inline bool XMPlaneNotEqual
(
    FXMVECTOR P1,
    FXMVECTOR P2
)
{
    return XMVector4NotEqual(P1, P2);
}

//------------------------------------------------------------------------------

// True if any plane coefficient is NaN.
inline bool XMPlaneIsNaN
(
    FXMVECTOR P
)
{
    return XMVector4IsNaN(P);
}

//------------------------------------------------------------------------------

// True if any plane coefficient is +/- infinity.
inline bool XMPlaneIsInfinite
(
    FXMVECTOR P
)
{
    return XMVector4IsInfinite(P);
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// 4-component dot product of a plane and a 4-vector (V.w participates).
inline XMVECTOR XMPlaneDot
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
    return XMVector4Dot(P, V);
}

//------------------------------------------------------------------------------

// Signed distance of point V from plane P (V treated as a point, w := 1).
inline XMVECTOR XMPlaneDotCoord
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
    // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3]

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Force V.w to 1 so P.d is added into the dot product.
    XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);
    XMVECTOR Result = XMVector4Dot(P, V3);
    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Dot product of the plane normal with a direction vector (w ignored).
inline XMVECTOR XMPlaneDotNormal
(
    FXMVECTOR P,
    FXMVECTOR V
)
{
    return XMVector3Dot(P, V);
}

//------------------------------------------------------------------------------
// XMPlaneNormalizeEst uses a reciprocal estimate and
// returns QNaN on zero and infinite vectors.

// Fast, estimated plane normalization (see contract comment above: QNaN on
// zero/infinite-length normals). Scales all four coefficients by ~1/|xyz|.
inline XMVECTOR XMPlaneNormalizeEst
(
    FXMVECTOR P
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
    return XMVectorMultiply(P, Result);

#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product (xyz only)
    XMVECTOR vDot = _mm_mul_ps(P,P);
    // x=Dot.y, y=Dot.z
    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
    // Result.x = x+y
    vDot = _mm_add_ss(vDot,vTemp);
    // x=Dot.z
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    // Result.x = (x+y)+z
    vDot = _mm_add_ss(vDot,vTemp);
    // Splat x
    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
    // Get the estimated reciprocal square root of the length squared
    vDot = _mm_rsqrt_ps(vDot);
    // Scale the plane by the reciprocal length estimate
    vDot = _mm_mul_ps(vDot,P);
    return vDot;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Full-precision plane normalization: divides all four coefficients by the
// length of the xyz normal. The SSE path maps a zero-length normal to zero;
// NOTE(review): the no-intrinsics and NEON paths handle the zero case
// differently (see below) -- confirm which behavior callers rely on.

inline XMVECTOR XMPlaneNormalize
(
    FXMVECTOR P
)
{
#if defined(_XM_NO_INTRINSICS_)
    // NOTE: despite the name, fLengthSq holds the LENGTH here (sqrtf applied).
    float fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2]));
    // Prevent divide by zero
    if (fLengthSq) {
        fLengthSq = 1.0f/fLengthSq;
    }
    {
        XMVECTOR vResult = {
            P.vector4_f32[0]*fLengthSq,
            P.vector4_f32[1]*fLengthSq,
            P.vector4_f32[2]*fLengthSq,
            P.vector4_f32[3]*fLengthSq
        };
        return vResult;
    }
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vLength = XMVector3ReciprocalLength(P);
    return XMVectorMultiply( P, vLength );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y and z only
    XMVECTOR vLengthSq = _mm_mul_ps(P,P);
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Reciprocal mul to perform the normalization
    vResult = _mm_div_ps(P,vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult,vLengthSq);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Intersection of a plane with the infinite line through LinePoint1 and
// LinePoint2. Returns QNaN in all lanes when the line is (near) parallel
// to the plane.

inline XMVECTOR XMPlaneIntersectLine
(
    FXMVECTOR P,
    FXMVECTOR LinePoint1,
    FXMVECTOR LinePoint2
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // D = N.LinePoint1 - N.LinePoint2: projection of the line direction
    // onto the plane normal.
    XMVECTOR V1 = XMVector3Dot(P, LinePoint1);
    XMVECTOR V2 = XMVector3Dot(P, LinePoint2);
    XMVECTOR D = XMVectorSubtract(V1, V2);

    // Parametric distance along the line from LinePoint1 to the plane.
    XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1);
    VT = XMVectorDivide(VT, D);

    XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1);
    Point = XMVectorMultiplyAdd(Point, VT, LinePoint1);

    // Parallel line (D ~ 0) -> no single intersection point -> QNaN.
    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v);

    return XMVectorSelect(Point, g_XMQNaN.v, Control);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Intersection line of two planes, returned as two points on that line.
// Both outputs are QNaN when the planes are (near) parallel.
_Use_decl_annotations_
inline void XMPlaneIntersectPlane
(
    XMVECTOR* pLinePoint1,
    XMVECTOR* pLinePoint2,
    FXMVECTOR P1,
    FXMVECTOR P2
)
{
    assert(pLinePoint1);
    assert(pLinePoint2);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // V1 is the direction of the intersection line.
    XMVECTOR V1 = XMVector3Cross(P2, P1);

    // |V1|^2 ~ 0 means the planes are parallel.
    XMVECTOR LengthSq = XMVector3LengthSq(V1);

    XMVECTOR V2 = XMVector3Cross(P2, V1);

    XMVECTOR P1W = XMVectorSplatW(P1);
    XMVECTOR Point = XMVectorMultiply(V2, P1W);

    XMVECTOR V3 = XMVector3Cross(V1, P1);

    XMVECTOR P2W = XMVectorSplatW(P2);
    Point = XMVectorMultiplyAdd(V3, P2W, Point);

    // First point on the line; second point is one direction-vector away.
    XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq);

    XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1);

    XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v);
    *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control);
    *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Transforms a plane by the matrix M (P * M as a row vector). For correct
// results M is presumably the inverse-transpose of the point transform --
// TODO confirm against callers.

inline XMVECTOR XMPlaneTransform
(
    FXMVECTOR P,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR W = XMVectorSplatW(P);
    XMVECTOR Z = XMVectorSplatZ(P);
    XMVECTOR Y = XMVectorSplatY(P);
    XMVECTOR X = XMVectorSplatX(P);

    // Result = X*M.r[0] + Y*M.r[1] + Z*M.r[2] + W*M.r[3]
    XMVECTOR Result = XMVectorMultiply(W, M.r[3]);
    Result = XMVectorMultiplyAdd(Z, M.r[2], Result);
    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Transforms an array of planes; a plane is a 4-vector, so this simply
// forwards to the 4D vector stream transform.
_Use_decl_annotations_
inline XMFLOAT4* XMPlaneTransformStream
(
    XMFLOAT4* pOutputStream,
    size_t OutputStride,
    const XMFLOAT4* pInputStream,
    size_t InputStride,
    size_t PlaneCount,
    CXMMATRIX M
)
{
    return XMVector4TransformStream(pOutputStream,
                                    OutputStride,
                                    pInputStream,
                                    InputStride,
                                    PlaneCount,
                                    M);
}

//------------------------------------------------------------------------------
// Conversion operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Builds plane coefficients (Normal.xyz, -Point.Normal) from a point on the
// plane and its normal. Normal is presumably unit length -- TODO confirm.

inline XMVECTOR XMPlaneFromPointNormal
(
    FXMVECTOR Point,
    FXMVECTOR Normal
)
{
    XMVECTOR W = XMVector3Dot(Point, Normal);
    W = XMVectorNegate(W);
    return XMVectorSelect(W, Normal,
g_XMSelect1110.v);
}

//------------------------------------------------------------------------------
// Builds a plane from three points. The normal is normalized, so degenerate
// (collinear) inputs produce QNaN coefficients via XMVector3Normalize.

inline XMVECTOR XMPlaneFromPoints
(
    FXMVECTOR Point1,
    FXMVECTOR Point2,
    FXMVECTOR Point3
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR V21 = XMVectorSubtract(Point1, Point2);
    XMVECTOR V31 = XMVectorSubtract(Point1, Point3);

    XMVECTOR N = XMVector3Cross(V21, V31);
    N = XMVector3Normalize(N);

    // D = -(N . Point1)
    XMVECTOR D = XMPlaneDotNormal(N, Point1);
    D = XMVectorNegate(D);

    XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

/****************************************************************************
 *
 * Color
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Exact equality of all four color channels (RGBA).

inline bool XMColorEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    return XMVector4Equal(C1, C2);
}

//------------------------------------------------------------------------------

inline bool XMColorNotEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    return XMVector4NotEqual(C1, C2);
}

//------------------------------------------------------------------------------
// True only if ALL four channels of C1 compare greater than C2.

inline bool XMColorGreater
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    return XMVector4Greater(C1, C2);
}

//------------------------------------------------------------------------------

inline bool XMColorGreaterOrEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    return XMVector4GreaterOrEqual(C1, C2);
}

//------------------------------------------------------------------------------
// True only if ALL four channels of C1 compare less than C2.

inline bool XMColorLess
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    return XMVector4Less(C1, C2);
}

//------------------------------------------------------------------------------

inline bool XMColorLessOrEqual
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    return XMVector4LessOrEqual(C1, C2);
}

//------------------------------------------------------------------------------
// True if any channel is NaN.

inline bool XMColorIsNaN
(
    FXMVECTOR C
)
{
    return XMVector4IsNaN(C);
}

//------------------------------------------------------------------------------
// True if any channel is +/- infinity.

inline bool XMColorIsInfinite
(
    FXMVECTOR C
)
{
    return XMVector4IsInfinite(C);
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Computes the complement of a color: (1-r, 1-g, 1-b) with alpha passed
// through unchanged. Assumes channels are in [0,1] for a meaningful result.

inline XMVECTOR XMColorNegative
(
    FXMVECTOR vColor
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = {
        1.0f - vColor.vector4_f32[0],
        1.0f - vColor.vector4_f32[1],
        1.0f - vColor.vector4_f32[2],
        vColor.vector4_f32[3]
    };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Sign-flip x,y,z via XOR, then add (1,1,1,0): 1-c == 1+(-c).
    XMVECTOR vTemp = veorq_u32(vColor,g_XMNegate3);
    return vaddq_f32(vTemp,g_XMOne3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Negate only x,y and z.
    XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3);
    // Add 1,1,1,0 to -x,-y,-z,w
    return _mm_add_ps(vTemp,g_XMOne3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Component-wise modulation (blend) of two colors, including alpha.

inline XMVECTOR XMColorModulate
(
    FXMVECTOR C1,
    FXMVECTOR C2
)
{
    return XMVectorMultiply(C1, C2);
}

//------------------------------------------------------------------------------
// Interpolates the RGB channels between the color's Rec.709-style luminance
// (grey, fSaturation=0) and the original color (fSaturation=1); alpha is
// preserved. Values outside [0,1] extrapolate.

inline XMVECTOR XMColorAdjustSaturation
(
    FXMVECTOR vColor,
    float fSaturation
)
{
    // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2];
    // Result = (C - Luminance) * Saturation + Luminance;

#if defined(_XM_NO_INTRINSICS_)
    const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};

    float fLuminance = (vColor.vector4_f32[0]*gvLuminance.f[0])+(vColor.vector4_f32[1]*gvLuminance.f[1])+(vColor.vector4_f32[2]*gvLuminance.f[2]);
    XMVECTORF32 vResult = {
        ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance,
        ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance,
        ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance,
        vColor.vector4_f32[3]};
    return vResult.v;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
    XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance );
    XMVECTOR vResult = vsubq_f32(vColor, vLuminance);
    XMVECTOR vSaturation = vdupq_n_f32(fSaturation);
    vResult = vmlaq_f32( vLuminance, vResult, vSaturation );
    // Keep the original alpha channel.
    return vbslq_f32( g_XMSelect1110, vResult, vColor );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
    XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance );
    // Splat fSaturation
    XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
    // vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
    XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance);
    vResult = _mm_mul_ps(vResult,vSaturation);
    vResult = _mm_add_ps(vResult,vLuminance);
    // Retain w from the source color
    vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
    vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
    return vResult;
// NOTE(review): upstream DirectXMath has "#else // _XM_VMX128_INTRINSICS_"
// here; this elif leaves the function with no return when
// XM_NO_MISALIGNED_VECTOR_ACCESS is the selected path -- verify intent.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Interpolates the RGB channels between mid-grey (fContrast=0) and the
// original color (fContrast=1); alpha is preserved.

inline XMVECTOR XMColorAdjustContrast
(
    FXMVECTOR vColor,
    float fContrast
)
{
    // Result = (vColor - 0.5f) * fContrast + 0.5f;

#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = {
        ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f,
        ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f,
        ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f,
        vColor.vector4_f32[3]        // Leave W untouched
    };
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v);
    XMVECTOR vContrast = vdupq_n_f32(fContrast);
    vResult = vmlaq_f32( g_XMOneHalf.v, vResult, vContrast );
    // Keep the original alpha channel.
    return vbslq_f32( g_XMSelect1110, vResult, vColor );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vScale = _mm_set_ps1(fContrast);           // Splat the scale
    XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf);  // Subtract 0.5f from the source (Saving source)
    vResult = _mm_mul_ps(vResult,vScale);               // Mul by scale
    vResult = _mm_add_ps(vResult,g_XMOneHalf);          // Add 0.5f
    // Retain w from the source color
    vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
    vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
    return vResult;
// NOTE(review): same suspicious #elif as XMColorAdjustSaturation above;
// upstream uses "#else // _XM_VMX128_INTRINSICS_" -- verify intent.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}


//------------------------------------------------------------------------------
// Converts an RGB color to HSL. Output lanes are (hue, saturation,
// luminance, alpha), all normalized to [0,1]; alpha is passed through.

inline XMVECTOR XMColorRGBToHSL( FXMVECTOR rgb )
{
    XMVECTOR r = XMVectorSplatX( rgb );
    XMVECTOR g = XMVectorSplatY( rgb );
    XMVECTOR b = XMVectorSplatZ( rgb );

    XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
    XMVECTOR max = XMVectorMax( r, XMVectorMax( g, b ) );

    // Luminance is the average of the extreme channels.
    XMVECTOR l = XMVectorMultiply( XMVectorAdd( min, max ), g_XMOneHalf );

    // Chroma (channel spread).
    XMVECTOR d = XMVectorSubtract( max, min );

    XMVECTOR la = XMVectorSelect( rgb, l, g_XMSelect1110 );

    if ( XMVector3Less( d, g_XMEpsilon ) )
    {
        // Achromatic, assume H and S of 0
        return XMVectorSelect( la, g_XMZero, g_XMSelect1100 );
    }
    else
    {
        XMVECTOR s, h;

        XMVECTOR d2 = XMVectorAdd( min, max );

        if ( XMVector3Greater( l, g_XMOneHalf ) )
        {
            // d / (2-max-min)
            s = XMVectorDivide( d, XMVectorSubtract( g_XMTwo, d2 ) );
        }
        else
        {
            // d / (max+min)
            s = XMVectorDivide( d, d2 );
        }

        if ( XMVector3Equal( r, max ) )
        {
            // Red is max
            h = XMVectorDivide( XMVectorSubtract( g, b ), d );
        }
        else if ( XMVector3Equal( g, max ) )
        {
            // Green is max
            h = XMVectorDivide( XMVectorSubtract( b, r ), d );
            h = XMVectorAdd( h, g_XMTwo );
        }
        else
        {
            // Blue is max
            h = XMVectorDivide( XMVectorSubtract( r, g ), d );
            h = XMVectorAdd( h, g_XMFour );
        }

        // Normalize hue from sextants to [0,1), wrapping negatives.
        h = XMVectorDivide( h, g_XMSix );

        if ( XMVector3Less( h, g_XMZero ) )
            h = XMVectorAdd( h, g_XMOne );

        XMVECTOR lha = XMVectorSelect( la, h, g_XMSelect1100 );
        return XMVectorSelect( s, lha, g_XMSelect1011 );
    }
}

//------------------------------------------------------------------------------

namespace Internal
{

// Standard HSL "hue to channel" helper: evaluates one RGB channel from the
// p/q intermediates and a (possibly out-of-range) hue t, wrapping t into [0,1].
inline XMVECTOR XMColorHue2Clr( FXMVECTOR p, FXMVECTOR q, FXMVECTOR h )
{
    static const XMVECTORF32 oneSixth = { 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f };
    static const XMVECTORF32 twoThirds = { 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f };

    XMVECTOR t = h;

    if ( XMVector3Less( t, g_XMZero ) )
        t = XMVectorAdd( t, g_XMOne );

    if ( XMVector3Greater( t, g_XMOne ) )
        t = XMVectorSubtract( t, g_XMOne );

    if ( XMVector3Less( t, oneSixth ) )
    {
        // p + (q - p) * 6 * t
        XMVECTOR t1 = XMVectorSubtract( q, p );
        XMVECTOR t2 = XMVectorMultiply( g_XMSix, t );
        return XMVectorMultiplyAdd( t1, t2, p );
    }

    if ( XMVector3Less( t, g_XMOneHalf ) )
        return q;

    if ( XMVector3Less( t, twoThirds ) )
    {
        // p + (q - p) * 6 * (2/3 - t)
        XMVECTOR t1 = XMVectorSubtract( q, p );
        XMVECTOR t2 = XMVectorMultiply( g_XMSix, XMVectorSubtract( twoThirds, t ) );
        return XMVectorMultiplyAdd( t1, t2, p );
    }

    return p;
}

}; // namespace Internal  (NOTE: trailing ';' is redundant but harmless)

// Converts an HSL color (hue, saturation, luminance, alpha in [0,1]) back to
// RGB; alpha is passed through. Inverse of XMColorRGBToHSL.
inline XMVECTOR XMColorHSLToRGB( FXMVECTOR hsl )
{
    static const XMVECTORF32 oneThird = { 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f };

    XMVECTOR s = XMVectorSplatY( hsl );
    XMVECTOR l = XMVectorSplatZ( hsl );

    if ( XMVector3NearEqual( s, g_XMZero, g_XMEpsilon ) )
    {
        // Achromatic
        return XMVectorSelect( hsl, l, g_XMSelect1110 );
    }
    else
    {
        XMVECTOR h = XMVectorSplatX( hsl );

        XMVECTOR q;
        if ( XMVector3Less( l, g_XMOneHalf ) )
        {
            q = XMVectorMultiply( l, XMVectorAdd ( g_XMOne, s ) );
        }
        else
        {
            q = XMVectorSubtract( XMVectorAdd( l, s ), XMVectorMultiply( l, s ) );
        }

        XMVECTOR p = XMVectorSubtract( XMVectorMultiply( g_XMTwo, l ), q );

        // Each channel samples the hue wheel a third of a turn apart.
        XMVECTOR r = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorAdd( h, oneThird ) );
        XMVECTOR g = DirectX::Internal::XMColorHue2Clr( p, q, h );
        XMVECTOR b = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorSubtract( h, oneThird ) );

        XMVECTOR rg = XMVectorSelect( g, r, g_XMSelect1000 );
        XMVECTOR ba = XMVectorSelect( hsl, b, g_XMSelect1110 );

        return XMVectorSelect( ba, rg, g_XMSelect1100 );
    }
}

//------------------------------------------------------------------------------
// Converts an RGB color to HSV. Output lanes are (hue, saturation, value,
// alpha), all normalized to [0,1]; alpha is passed through.

inline XMVECTOR XMColorRGBToHSV( FXMVECTOR rgb )
{
    XMVECTOR r = XMVectorSplatX( rgb );
    XMVECTOR g = XMVectorSplatY( rgb );
    XMVECTOR b = XMVectorSplatZ( rgb );

    XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
    XMVECTOR v = XMVectorMax( r, XMVectorMax( g, b ) );

    XMVECTOR d = XMVectorSubtract( v, min );

    // Guard the divide: black (v ~ 0) has zero saturation.
    XMVECTOR s = ( XMVector3NearEqual( v, g_XMZero, g_XMEpsilon ) ) ? g_XMZero : XMVectorDivide( d, v );

    if ( XMVector3Less( d, g_XMEpsilon ) )
    {
        // Achromatic, assume H of 0
        XMVECTOR hv = XMVectorSelect( v, g_XMZero, g_XMSelect1000 );
        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
        return XMVectorSelect( s, hva, g_XMSelect1011 );
    }
    else
    {
        XMVECTOR h;

        if ( XMVector3Equal( r, v ) )
        {
            // Red is max
            h = XMVectorDivide( XMVectorSubtract( g, b ), d );

            if ( XMVector3Less( g, b ) )
                h = XMVectorAdd( h, g_XMSix );
        }
        else if ( XMVector3Equal( g, v ) )
        {
            // Green is max
            h = XMVectorDivide( XMVectorSubtract( b, r ), d );
            h = XMVectorAdd( h, g_XMTwo );
        }
        else
        {
            // Blue is max
            h = XMVectorDivide( XMVectorSubtract( r, g ), d );
            h = XMVectorAdd( h, g_XMFour );
        }

        // Normalize hue from sextants to [0,1).
        h = XMVectorDivide( h, g_XMSix );

        XMVECTOR hv = XMVectorSelect( v, h, g_XMSelect1000 );
        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
        return XMVectorSelect( s, hva, g_XMSelect1011 );
    }
}

//------------------------------------------------------------------------------
// Converts an HSV color (hue, saturation, value, alpha in [0,1]) back to RGB;
// alpha is passed through. Inverse of XMColorRGBToHSV.

inline XMVECTOR XMColorHSVToRGB( FXMVECTOR hsv )
{
    XMVECTOR h = XMVectorSplatX( hsv );
    XMVECTOR s = XMVectorSplatY( hsv );
    XMVECTOR v = XMVectorSplatZ( hsv );

    // Split hue into sextant index i and fractional part f.
    XMVECTOR h6 = XMVectorMultiply( h, g_XMSix );

    XMVECTOR i = XMVectorFloor( h6 );
    XMVECTOR f = XMVectorSubtract( h6, i );

    // p = v* (1-s)
    XMVECTOR p = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, s ) );

    // q = v*(1-f*s)
    XMVECTOR q = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( f, s ) ) );

    // t = v*(1 - (1-f)*s)
    XMVECTOR t = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( XMVectorSubtract( g_XMOne, f ), s ) ) );

    // Sextant index, wrapped so h == 1.0 maps back to sextant 0.
    int ii = static_cast<int>( XMVectorGetX( XMVectorMod( i, g_XMSix ) ) );

    XMVECTOR _rgb;

    // Assemble (r,g,b) from (v,p,q,t) according to the sextant.
    switch (ii)
    {
    case 0: // rgb = vtp
        {
            XMVECTOR vt = XMVectorSelect( t, v, g_XMSelect1000 );
            _rgb = XMVectorSelect( p, vt, g_XMSelect1100 );
        }
        break;
    case 1: // rgb = qvp
        {
            XMVECTOR qv = XMVectorSelect( v, q, g_XMSelect1000 );
            _rgb = XMVectorSelect( p, qv, g_XMSelect1100 );
        }
        break;
    case 2: // rgb = pvt
        {
            XMVECTOR pv = XMVectorSelect( v, p, g_XMSelect1000 );
            _rgb = XMVectorSelect( t, pv, g_XMSelect1100 );
        }
        break;
    case 3: // rgb = pqv
        {
            XMVECTOR pq = XMVectorSelect( q, p, g_XMSelect1000 );
            _rgb = XMVectorSelect( v, pq, g_XMSelect1100 );
        }
        break;
    case 4: // rgb = tpv
        {
            XMVECTOR tp = XMVectorSelect( p, t, g_XMSelect1000 );
            _rgb = XMVectorSelect( v, tp, g_XMSelect1100 );
        }
        break;
    default: // rgb = vpq
        {
            XMVECTOR vp = XMVectorSelect( p, v, g_XMSelect1000 );
            _rgb = XMVectorSelect( q, vp, g_XMSelect1100 );
        }
        break;
    }

    return XMVectorSelect( hsv, _rgb, g_XMSelect1110 );
}

//------------------------------------------------------------------------------
// Converts RGB to SDTV-range YUV (BT.601-style coefficients) via a 3x3
// matrix; alpha is passed through.

inline XMVECTOR XMColorRGBToYUV( FXMVECTOR rgb )
{
    static const XMVECTORF32 Scale0 = { 0.299f, -0.147f, 0.615f, 0.0f };
    static const XMVECTORF32 Scale1 = { 0.587f, -0.289f, -0.515f, 0.0f };
    static const XMVECTORF32 Scale2 = { 0.114f, 0.436f, -0.100f, 0.0f };

    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
    XMVECTOR clr = XMVector3Transform( rgb, M );

    return XMVectorSelect( rgb, clr, g_XMSelect1110 );
}

//------------------------------------------------------------------------------
// Inverse of XMColorRGBToYUV; alpha is passed through.

inline XMVECTOR XMColorYUVToRGB( FXMVECTOR yuv )
{
    static const XMVECTORF32 Scale1 = { 0.0f, -0.395f, 2.032f, 0.0f };
    static const XMVECTORF32 Scale2 = { 1.140f, -0.581f, 0.0f, 0.0f };

    XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero );
    XMVECTOR clr = XMVector3Transform( yuv, M );

    return XMVectorSelect( yuv, clr, g_XMSelect1110 );
}


//------------------------------------------------------------------------------
// Converts RGB to HDTV-range YUV (BT.709-style coefficients); alpha is
// passed through.

inline XMVECTOR XMColorRGBToYUV_HD( FXMVECTOR rgb )
{
    static const XMVECTORF32 Scale0 = { 0.2126f, -0.0997f, 0.6150f, 0.0f };
    static const XMVECTORF32 Scale1 = { 0.7152f, -0.3354f, -0.5586f, 0.0f };
    static const XMVECTORF32 Scale2 = { 0.0722f, 0.4351f, -0.0564f, 0.0f };

    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
    XMVECTOR clr = XMVector3Transform( rgb, M );

    return XMVectorSelect( rgb, clr, g_XMSelect1110 );
}

//------------------------------------------------------------------------------
// Inverse of XMColorRGBToYUV_HD; alpha is passed through.

inline XMVECTOR XMColorYUVToRGB_HD( FXMVECTOR yuv )
{
    static const XMVECTORF32 Scale1 = { 0.0f, -0.2153f, 2.1324f, 0.0f };
    static const XMVECTORF32 Scale2 = { 1.2803f, -0.3806f, 0.0f, 0.0f };

    XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero );
    XMVECTOR clr = XMVector3Transform( yuv, M );

    return XMVectorSelect( yuv, clr, g_XMSelect1110 );
}

//------------------------------------------------------------------------------
// Converts RGB to CIE XYZ (1931 CIE RGB primaries, hence the 1/0.17697
// normalization); alpha is passed through.

inline XMVECTOR XMColorRGBToXYZ( FXMVECTOR rgb )
{
    static const XMVECTORF32 Scale0 = { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f };
    static const XMVECTORF32 Scale1 = { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f };
    static const XMVECTORF32 Scale2 = { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f };
    static const XMVECTORF32 Scale = { 1.f/0.17697f, 1.f/0.17697f, 1.f/0.17697f, 0.0f };

    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
    XMVECTOR clr = XMVectorMultiply( XMVector3Transform( rgb, M ), Scale );

    return XMVectorSelect( rgb, clr, g_XMSelect1110 );
}

// Inverse of XMColorRGBToXYZ; alpha is passed through.
inline XMVECTOR XMColorXYZToRGB( FXMVECTOR xyz )
{
    static const XMVECTORF32 Scale0 = { 2.3706743f, -0.5138850f, 0.0052982f, 0.0f };
    static const XMVECTORF32 Scale1 = { -0.9000405f, 1.4253036f, -0.0146949f, 0.0f };
    static const XMVECTORF32 Scale2 = { -0.4706338f, 0.0885814f, 1.0093968f, 0.0f };
    static const XMVECTORF32 Scale = { 0.17697f, 0.17697f, 0.17697f, 0.0f };

    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
    XMVECTOR clr = XMVector3Transform( XMVectorMultiply( xyz, Scale ), M );

    return XMVectorSelect( xyz, clr, g_XMSelect1110 );
}

//------------------------------------------------------------------------------
// Converts CIE XYZ to gamma-encoded sRGB: linear matrix transform followed by
// the piecewise sRGB transfer curve; alpha is passed through.

inline XMVECTOR XMColorXYZToSRGB( FXMVECTOR xyz )
{
    static const XMVECTORF32 Scale0 = { 3.2406f, -0.9689f, 0.0557f, 0.0f };
    static const XMVECTORF32 Scale1 = { -1.5372f, 1.8758f, -0.2040f, 0.0f };
    static const XMVECTORF32 Scale2 = { -0.4986f, 0.0415f, 1.0570f, 0.0f };
    static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f };
    static const XMVECTORF32 Exp = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.0f };

    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
    XMVECTOR lclr = XMVector3Transform( xyz, M );

    XMVECTOR sel = XMVectorGreater( lclr, Cutoff );

    // clr = 12.92 * lclr for lclr <= 0.0031308f
    XMVECTOR smallC = XMVectorMultiply( lclr, g_XMsrgbScale );

    // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055)
    XMVECTOR largeC = XMVectorSubtract( XMVectorMultiply( g_XMsrgbA1, XMVectorPow( lclr, Exp ) ), g_XMsrgbA );

    XMVECTOR clr = XMVectorSelect( smallC, largeC, sel );

    return XMVectorSelect( xyz, clr, g_XMSelect1110 );
}

//------------------------------------------------------------------------------
// Converts gamma-encoded sRGB to CIE XYZ: inverse sRGB transfer curve
// followed by the linear matrix transform; alpha is passed through.

inline XMVECTOR XMColorSRGBToXYZ( FXMVECTOR srgb )
{
    static const XMVECTORF32 Scale0 = { 0.4124f, 0.2126f, 0.0193f, 0.0f };
    static const XMVECTORF32 Scale1 = { 0.3576f, 0.7152f, 0.1192f, 0.0f };
    static const XMVECTORF32 Scale2 = { 0.1805f, 0.0722f, 0.9505f, 0.0f };
    static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 0.0f };
    static const XMVECTORF32 Exp = { 2.4f, 2.4f, 2.4f, 1.0f };

    XMVECTOR sel = XMVectorGreater( srgb, Cutoff );

    // lclr = clr / 12.92
    XMVECTOR smallC = XMVectorDivide( srgb, g_XMsrgbScale );

    // lclr = pow( (clr + a) / (1+a), 2.4 )
    XMVECTOR largeC = XMVectorPow( XMVectorDivide( XMVectorAdd( srgb, g_XMsrgbA ), g_XMsrgbA1 ), Exp );

    XMVECTOR lclr = XMVectorSelect( smallC, largeC, sel );

    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
    XMVECTOR clr = XMVector3Transform( lclr, M );

    return XMVectorSelect( srgb, clr, g_XMSelect1110 );
}

/****************************************************************************
 *
 * Miscellaneous
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Runtime check that the CPU supports the instruction set this library was
// compiled for. May return a false negative when windows.h (and thus
// IsProcessorFeaturePresent) is not available at compile time.

inline bool XMVerifyCPUSupport()
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#if defined(_M_AMD64)
    // The X64 processor model requires SSE2 support
    return true;
#elif defined(PF_XMMI_INSTRUCTIONS_AVAILABLE)
    // Note that on Windows 2000 or older, SSE2 detection is not supported so this will always fail
    // Detecting SSE2 on older versions of Windows would require using cpuid directly
    return ( IsProcessorFeaturePresent( PF_XMMI_INSTRUCTIONS_AVAILABLE ) != 0 && IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE ) != 0 );
#else
    // If windows.h is not included, we return false (likely a false negative)
    return false;
#endif
#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#ifdef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
    return ( IsProcessorFeaturePresent( PF_ARM_NEON_INSTRUCTIONS_AVAILABLE ) != 0 );
#else
    // If windows.h is not included, we return false (likely a false negative)
    return false;
#endif
#else
    // No-intrinsics build: nothing special required of the CPU.
    return true;
#endif
}

//------------------------------------------------------------------------------
// Fresnel reflectance term for unpolarized light, clamped to [0,1].

inline XMVECTOR XMFresnelTerm
(
    FXMVECTOR CosIncidentAngle,
    FXMVECTOR RefractionIndex
)
{
    assert(!XMVector4IsInfinite(CosIncidentAngle));

    // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where
    // c = CosIncidentAngle
    // g = sqrt(c^2 + RefractionIndex^2 - 1)

#if \
defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // g = sqrt(|c^2 + n^2 - 1|); the abs guards against tiny negative values
    // when both refraction index and cosine are near zero.
    XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v);
    G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G);
    G = XMVectorAbs(G);
    G = XMVectorSqrt(G);

    XMVECTOR S = XMVectorAdd(G, CosIncidentAngle);
    XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle);

    // V0 = 0.5 * (g-c)^2 / (g+c)^2
    XMVECTOR V0 = XMVectorMultiply(D, D);
    XMVECTOR V1 = XMVectorMultiply(S, S);
    V1 = XMVectorReciprocal(V1);
    V0 = XMVectorMultiply(g_XMOneHalf.v, V0);
    V0 = XMVectorMultiply(V0, V1);

    // V2 = (c*(g+c) - 1)^2 / (c*(g-c) + 1)^2 + 1
    XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v);
    XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v);
    V2 = XMVectorMultiply(V2, V2);
    V3 = XMVectorMultiply(V3, V3);
    V3 = XMVectorReciprocal(V3);
    V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v);

    XMVECTOR Result = XMVectorMultiply(V0, V2);

    // Clamp to [0,1].
    Result = XMVectorSaturate(Result);

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2))
    XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex);
    XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle);
    G = _mm_sub_ps(G,g_XMOne);
    vTemp = _mm_add_ps(vTemp,G);
    // max((0-vTemp),vTemp) == abs(vTemp)
    // The abs is needed to deal with refraction and cosine being zero
    G = _mm_setzero_ps();
    G = _mm_sub_ps(G,vTemp);
    G = _mm_max_ps(G,vTemp);
    // Last operation, the sqrt()
    G = _mm_sqrt_ps(G);

    // Calc G-C and G+C
    XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle);
    XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle);
    // Perform the term (0.5f *(g - c)^2) / (g + c)^2
    XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC);
    vTemp = _mm_mul_ps(GAddC,GAddC);
    vResult = _mm_mul_ps(vResult,g_XMOneHalf);
    vResult = _mm_div_ps(vResult,vTemp);
    // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1)
    GAddC = _mm_mul_ps(GAddC,CosIncidentAngle);
    GSubC = _mm_mul_ps(GSubC,CosIncidentAngle);
    GAddC = _mm_sub_ps(GAddC,g_XMOne);
    GSubC = _mm_add_ps(GSubC,g_XMOne);
    GAddC = _mm_mul_ps(GAddC,GAddC);
    GSubC = _mm_mul_ps(GSubC,GSubC);
    GAddC = _mm_div_ps(GAddC,GSubC);
    GAddC = _mm_add_ps(GAddC,g_XMOne);
    // Multiply the two term parts
    vResult = _mm_mul_ps(vResult,GAddC);
    // Clamp to 0.0 - 1.0f
    vResult = _mm_max_ps(vResult,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Absolute-difference comparison of two scalars against a tolerance.

inline bool XMScalarNearEqual
(
    float S1,
    float S2,
    float Epsilon
)
{
    float Delta = S1 - S2;
    return (fabsf(Delta) <= Epsilon);
}

//------------------------------------------------------------------------------
// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI
inline float XMScalarModAngle
(
    float Angle
)
{
    // Note: The modulo is performed with unsigned math only to work
    // around a precision error on numbers that are close to PI

    // Normalize the range from 0.0f to XM_2PI
    Angle = Angle + XM_PI;
    // Perform the modulo, unsigned
    float fTemp = fabsf(Angle);
    fTemp = fTemp - (XM_2PI * (float)((int32_t)(fTemp/XM_2PI)));
    // Restore the number to the range of -XM_PI to XM_PI-epsilon
    fTemp = fTemp - XM_PI;
    // If the modulo'd value was negative, restore negation
    if (Angle<0.0f) {
        fTemp = -fTemp;
    }
    return fTemp;
}

//------------------------------------------------------------------------------
// sin(Value) via range reduction plus an 11-degree minimax polynomial.

inline float XMScalarSin
(
    float Value
)
{
    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
    float quotient = XM_1DIV2PI*Value;
    if (Value >= 0.0f)
    {
        quotient = (float)((int)(quotient + 0.5f));
    }
    else
    {
        quotient = (float)((int)(quotient - 0.5f));
    }
    float y = Value - XM_2PI*quotient;

    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
    if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
    }
    else if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
    }

    // 11-degree minimax approximation
    float y2 = y * y;
    return ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y;
}

//------------------------------------------------------------------------------
// Faster, lower-precision sin(Value): same reduction, 7-degree polynomial.

inline float XMScalarSinEst
(
    float Value
)
{
    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
    float quotient = XM_1DIV2PI*Value;
    if (Value >= 0.0f)
    {
        quotient = (float)((int)(quotient + 0.5f));
    }
    else
    {
        quotient = (float)((int)(quotient - 0.5f));
    }
    float y = Value - XM_2PI*quotient;

    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
    if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
    }
    else if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
    }

    // 7-degree minimax approximation
    float y2 = y * y;
    return ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y;
}

//------------------------------------------------------------------------------
// cos(Value) via range reduction plus a 10-degree minimax polynomial; the
// quadrant fold flips the sign instead of the argument.

inline float XMScalarCos
(
    float Value
)
{
    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
    float quotient = XM_1DIV2PI*Value;
    if (Value >= 0.0f)
    {
        quotient = (float)((int)(quotient + 0.5f));
    }
    else
    {
        quotient = (float)((int)(quotient - 0.5f));
    }
    float y = Value - XM_2PI*quotient;

    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
    float sign;
    if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }
    else if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else
    {
        sign = +1.0f;
    }

    // 10-degree minimax approximation
    float y2 = y*y;
    float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f;
    return sign*p;
}

//------------------------------------------------------------------------------
// Faster, lower-precision cos(Value): same reduction, 6-degree polynomial.

inline float XMScalarCosEst
(
    float Value
)
{
    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
    float quotient = XM_1DIV2PI*Value;
    if (Value >= 0.0f)
    {
        quotient = (float)((int)(quotient + 0.5f));
    }
    else
    {
        quotient = (float)((int)(quotient - 0.5f));
    }
    float y = Value - XM_2PI*quotient;

    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
    float sign;
    if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }
    else if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else
    {
        sign = +1.0f;
    }

    // 6-degree minimax approximation
    float y2 = y * y;
    float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f;
    return sign*p;
}

//------------------------------------------------------------------------------
// Computes sin and cos of Value together, sharing one range reduction.

_Use_decl_annotations_
inline void XMScalarSinCos
(
    float* pSin,
    float* pCos,
    float Value
)
{
    assert(pSin);
    assert(pCos);

    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
    float quotient = XM_1DIV2PI*Value;
    if (Value >= 0.0f)
    {
        quotient = (float)((int)(quotient + 0.5f));
    }
    else
    {
        quotient = (float)((int)(quotient - 0.5f));
    }
    float y = Value - XM_2PI*quotient;

    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
    float sign;
    if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }
    else if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else
    {
        sign = +1.0f;
    }

    float y2 = y * y;

    // 11-degree minimax approximation
    *pSin = ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y;

    // 10-degree minimax approximation (sign applies only to cosine)
    float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f;
    *pCos = sign*p;
}

//------------------------------------------------------------------------------
// Estimated variant of XMScalarSinCos: 7-degree sin, 6-degree cos.

_Use_decl_annotations_
inline void XMScalarSinCosEst
(
    float* pSin,
    float* pCos,
    float Value
)
{
    assert(pSin);
    assert(pCos);

    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
    float quotient = XM_1DIV2PI*Value;
    if (Value >= 0.0f)
    {
        quotient = (float)((int)(quotient + 0.5f));
    }
    else
    {
        quotient = (float)((int)(quotient - 0.5f));
    }
    float y = Value - XM_2PI*quotient;

    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
    float sign;
    if (y > XM_PIDIV2)
    {
        y = XM_PI - y;
        sign = -1.0f;
    }
    else if (y < -XM_PIDIV2)
    {
        y = -XM_PI - y;
        sign = -1.0f;
    }
    else
    {
        sign = +1.0f;
    }

    float y2 = y * y;

    // 7-degree minimax approximation
    *pSin = ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y;

    // 6-degree minimax approximation
    float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f;
    *pCos = sign*p;
}

//------------------------------------------------------------------------------
// arcsine approximation; body continues past this chunk of the file.

inline float XMScalarASin
(
    float Value
)
{
    // Clamp input to [-1,1].
+ bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrt(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarASinEst +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrt(omx); + + // 3-degree minimax approximation + float result = ((-0.0187293f*x+0.0742610f)*x-0.2121144f)*x+1.5707288f; + result *= root; // acos(|x|) + + // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x) + return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACos +( + float Value +) +{ + // Clamp input to [-1,1]. + bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 7-degree minimax approximation + float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + +//------------------------------------------------------------------------------ + +inline float XMScalarACosEst +( + float Value +) +{ + // Clamp input to [-1,1]. 
+ bool nonnegative = (Value >= 0.0f); + float x = fabsf(Value); + float omx = 1.0f - x; + if (omx < 0.0f) + { + omx = 0.0f; + } + float root = sqrtf(omx); + + // 3-degree minimax approximation + float result = ( ( -0.0187293f * x + 0.0742610f ) * x - 0.2121144f ) * x + 1.5707288f; + result *= root; + + // acos(x) = pi - acos(-x) when x < 0 + return (nonnegative ? result : XM_PI - result); +} + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathVector.inl b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathVector.inl new file mode 100644 index 00000000..39e24055 --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathVector.inl @@ -0,0 +1,10596 @@ +//------------------------------------------------------------------------------------- +// DirectXMathVector.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
//-------------------------------------------------------------------------------------

#ifdef _MSC_VER
#pragma once
#endif

// Scalar bit-pattern classification used only by the no-intrinsics path:
//  XMISNAN - exponent field all ones AND mantissa non-zero  -> NaN
//  XMISINF - all bits except sign equal to 0x7F800000       -> +/-infinity
#if defined(_XM_NO_INTRINSICS_)
#define XMISNAN(x) ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0)
#define XMISINF(x) ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000)
#endif

/****************************************************************************
 *
 * General Vector
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Assignment operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Return a vector with all elements equaling zero
inline XMVECTOR XMVectorZero()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Integer zero has the same bit pattern as 0.0f in every lane.
    return vdupq_n_u32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with four floating point values
// (x goes in lane 0, w in lane 3)
inline XMVECTOR XMVectorSet
(
    float x,
    float y,
    float z,
    float w
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = {x,y,z,w};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the raw float bit patterns two at a time into 64-bit halves,
    // then combine the halves into one 128-bit register.
    __n64 V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32));
    __n64 V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32));
    return vcombine_f32(V0, V1);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_set_ps takes the highest lane first, hence the reversed order.
    return _mm_set_ps( w, z, y, x );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

+//------------------------------------------------------------------------------ +// Initialize a vector with four integer values +inline XMVECTOR XMVectorSetInt +( + uint32_t x, + uint32_t y, + uint32_t z, + uint32_t w +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = {x,y,z,w}; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 V0 = vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32)); + __n64 V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32)); + return vcombine_u32(V0, V1); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set_epi32( w, z, y, x ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value +inline XMVECTOR XMVectorReplicate +( + float Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + XMVECTORF32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32( Value ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_set_ps1( Value ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorReplicatePtr +( + const float *pValue +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + float Value = pValue[0]; + XMVECTORF32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_dup_f32( pValue ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1( pValue ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ 
+// Initialize a vector with a replicated integer value +inline XMVECTOR XMVectorReplicateInt +( + uint32_t Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + XMVECTORU32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32( Value ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_set1_epi32( Value ); + return _mm_castsi128_ps(vTemp); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with a replicated integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorReplicateIntPtr +( + const uint32_t *pValue +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + uint32_t Value = pValue[0]; + XMVECTORU32 vResult = {Value,Value,Value,Value}; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_dup_u32(pValue); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps1(reinterpret_cast<const float *>(pValue)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits set (true mask) +inline XMVECTOR XMVectorTrueInt() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU}; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_s32(-1); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32(-1); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Initialize a vector with all bits clear (false mask) +inline XMVECTOR XMVectorFalseInt() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = 
{0.0f,0.0f,0.0f,0.0f}; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_setzero_ps(); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the x component of the vector +inline XMVECTOR XMVectorSplatX +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[0]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_low_f32( V ), 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the y component of the vector +inline XMVECTOR XMVectorSplatY +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[1]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_low_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the z component of the vector +inline XMVECTOR XMVectorSplatZ +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[2]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 0 ); +#elif 
defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Replicate the w component of the vector +inline XMVECTOR XMVectorSplatW +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = V.vector4_f32[3]; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_lane_f32( vget_high_f32( V ), 1 ); +#elif defined(_XM_SSE_INTRINSICS_) + return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.0f,1.0f,1.0f,1.0f +inline XMVECTOR XMVectorSplatOne() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_f32[0] = + vResult.vector4_f32[1] = + vResult.vector4_f32[2] = + vResult.vector4_f32[3] = 1.0f; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_f32(1.0f); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMOne; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of INF,INF,INF,INF +inline XMVECTOR XMVectorSplatInfinity() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7F800000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7F800000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMInfinity; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of 
Q_NAN,Q_NAN,Q_NAN,Q_NAN +inline XMVECTOR XMVectorSplatQNaN() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x7FC00000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x7FC00000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMQNaN; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f +inline XMVECTOR XMVectorSplatEpsilon() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x34000000; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x34000000); +#elif defined(_XM_SSE_INTRINSICS_) + return g_XMEpsilon; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f +inline XMVECTOR XMVectorSplatSignMask() +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult; + vResult.vector4_u32[0] = + vResult.vector4_u32[1] = + vResult.vector4_u32[2] = + vResult.vector4_u32[3] = 0x80000000U; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vdupq_n_u32(0x80000000U); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_set1_epi32( 0x80000000 ); + return reinterpret_cast<__m128*>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return a floating point value via an index. This is not a recommended +// function to use due to performance loss. 
inline float XMVectorGetByIndex(FXMVECTOR V, size_t i)
{
    // i must be 0..3; _Analysis_assume_ propagates the assert's invariant
    // to the static analyzer in release builds.
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    return V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return the X component in an FPU register.
inline float XMVectorGetX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Lane 0 can be read directly without a shuffle.
    return _mm_cvtss_f32(V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Y component in an FPU register.
inline float XMVectorGetY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 so it can be read from lane 0.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Z component in an FPU register.
inline float XMVectorGetZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 so it can be read from lane 0.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the W component in an FPU register.
inline float XMVectorGetW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 so it can be read from lane 0.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store a component indexed by i into a 32 bit float location in memory.
// f must be non-null; i must be 0..3.
_Use_decl_annotations_
inline void XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i)
{
    assert( f != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *f = V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *f = V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *f = V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store the X component into a 32 bit float location in memory.
// x must be non-null.
_Use_decl_annotations_
inline void XMVectorGetXPtr(float *x, FXMVECTOR V)
{
    assert( x != NULL);
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ss(x,V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Y component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetYPtr(float *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 into lane 0, then store the low scalar.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(y,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Z component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetZPtr(float *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 into lane 0, then store the low scalar.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(z,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the W component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetWPtr(float *w, FXMVECTOR V)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 into lane 0, then store the low scalar.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(w,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Return an integer value via an index. This is not a recommended
// function to use due to performance loss.
inline uint32_t XMVectorGetIntByIndex(FXMVECTOR V, size_t i)
{
    // i must be 0..3; _Analysis_assume_ propagates the assert's invariant
    // to the static analyzer in release builds.
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    return V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Return the X component in an integer register.
inline uint32_t XMVectorGetIntX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Lane 0 can be moved out directly without a shuffle.
    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Y component in an integer register.
inline uint32_t XMVectorGetIntY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 into lane 0, then move out the low 32 bits.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Z component in an integer register.
inline uint32_t XMVectorGetIntZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 into lane 0, then move out the low 32 bits.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the W component in an integer register.
inline uint32_t XMVectorGetIntW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 into lane 0, then move out the low 32 bits.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store a component indexed by i into a 32 bit integer location in memory.
// x must be non-null; i must be 0..3.
_Use_decl_annotations_
inline void XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
{
    assert( x != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *x = V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *x = V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store the X component into a 32 bit integer location in memory.
// x must be non-null.
_Use_decl_annotations_
inline void XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_store_ss only moves bits; storing through a float pointer is a
    // raw 32-bit store here.
    _mm_store_ss(reinterpret_cast<float *>(x),V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Y component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 into lane 0, then store the low 32 bits.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(reinterpret_cast<float *>(y),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Z component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 into lane 0, then store the low 32 bits.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(reinterpret_cast<float *>(z),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the W component into a 32 bit integer location in memory.
+_Use_decl_annotations_ +inline void XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V) +{ + assert( w != NULL ); +#if defined(_XM_NO_INTRINSICS_) + *w = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + _mm_store_ss(reinterpret_cast<float *>(w),vResult); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Set a single indexed floating point component +inline XMVECTOR XMVectorSetByIndex(FXMVECTOR V, float f, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_f32[i] = f; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR U = V; + U.n128_f32[i] = f; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR U = V; + U.m128_f32[i] = f; + return U; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a passed floating point value +inline XMVECTOR XMVectorSetX(FXMVECTOR V, float x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = x; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to a passed floating point value +inline XMVECTOR XMVectorSetY(FXMVECTOR V, float y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = y; + U.vector4_f32[2] = 
V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(y,V,1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} +// Sets the Z component of a vector to a passed floating point value +inline XMVECTOR XMVectorSetZ(FXMVECTOR V, float z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = z; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(z,V,2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the W component of a vector to a passed floating point value +inline XMVECTOR XMVectorSetW(FXMVECTOR V, float w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_f32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_set_ss(w); + // Replace the x component + vResult = 
_mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i) +{ + assert( f != NULL ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_f32[i] = *f; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR U = V; + U.n128_f32[i] = *f; + return U; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR U = V; + U.m128_f32[i] = *f; + return U; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetXPtr(FXMVECTOR V, const float *x) +{ + assert( x != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = *x; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_load_ss(x); + vResult = _mm_move_ss(V,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetYPtr(FXMVECTOR V, const float *y) +{ + assert( y != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = *y; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = 
V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(y,V,1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(y); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Z component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetZPtr(FXMVECTOR V, const float *z) +{ + assert( z != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = *z; + U.vector4_f32[3] = V.vector4_f32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // vld1q_lane_f32 loads *z directly into lane 2 of V + return vld1q_lane_f32(z,V,2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(z); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the W component of a vector to a floating point value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetWPtr(FXMVECTOR V, const float *w) +{ + assert( w != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_f32[0] = V.vector4_f32[0]; + U.vector4_f32[1] = V.vector4_f32[1]; + U.vector4_f32[2] = V.vector4_f32[2]; + U.vector4_f32[3] = *w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_f32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert 
input to vector + XMVECTOR vTemp = _mm_load_ss(w); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer passed by value +inline XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i) +{ + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_u32[i] = x; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // write through the XMVECTORU32 union so the integer lane can be stored without float reinterpretation + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = x; + return tmp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer passed by value +inline XMVECTOR XMVectorSetIntX(FXMVECTOR V, uint32_t x) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = x; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cvtsi32_si128(x); + XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to an integer passed by value +inline XMVECTOR XMVectorSetIntY(FXMVECTOR V, uint32_t y) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = y; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(y,V,1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(y); + // Replace the x component + vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Z component of a vector to an integer passed by value +inline XMVECTOR XMVectorSetIntZ(FXMVECTOR V, uint32_t z) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = z; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(z,V,2); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + // _mm_cvtsi32_si128 zero-extends z into lanes 1-3; only lane 0 is merged below + __m128i vTemp = _mm_cvtsi32_si128(z); + // Replace the x component + vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the W component of a vector to an integer passed by value +inline XMVECTOR XMVectorSetIntW(FXMVECTOR V, uint32_t w) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vsetq_lane_u32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + __m128i vTemp = _mm_cvtsi32_si128(w); + // Replace the x component + vResult = 
_mm_move_ss(vResult,_mm_castsi128_ps(vTemp)); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets a component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i) +{ + assert( x != NULL ); + assert( i < 4 ); + _Analysis_assume_( i < 4 ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U = V; + U.vector4_u32[i] = *x; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = *x; + return tmp; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTORU32 tmp; + tmp.v = V; + tmp.u[i] = *x; + return tmp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Sets the X component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x) +{ + assert( x != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = *x; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_u32(x,V,0); +#elif defined(_XM_SSE_INTRINSICS_) + // load the 32-bit integer as a float lane; the bit pattern is preserved + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(x)); + XMVECTOR vResult = _mm_move_ss(V,vTemp); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Y component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y) +{ + assert( y != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + 
U.vector4_u32[1] = *y; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_u32(y,V,1); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap y and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(y)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap y and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the Z component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z) +{ + assert( z != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = *z; + U.vector4_u32[3] = V.vector4_u32[3]; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_lane_u32(z,V,2); +#elif defined(_XM_SSE_INTRINSICS_) + // same lane-rotate trick as the float setters + // Swap z and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(z)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap z and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +// Sets the W component of a vector to an integer value passed by pointer +_Use_decl_annotations_ +inline XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w) +{ + assert( w != NULL ); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR U; + U.vector4_u32[0] = V.vector4_u32[0]; + U.vector4_u32[1] = V.vector4_u32[1]; + U.vector4_u32[2] = V.vector4_u32[2]; + U.vector4_u32[3] = *w; + return U; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return 
vld1q_lane_u32(w,V,3); +#elif defined(_XM_SSE_INTRINSICS_) + // Swap w and x + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); + // Convert input to vector + XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(w)); + // Replace the x component + vResult = _mm_move_ss(vResult,vTemp); + // Swap w and x again + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Returns a vector whose components are V's components rearranged by indices E0..E3 (each in [0,3]) +inline XMVECTOR XMVectorSwizzle +( + FXMVECTOR V, + uint32_t E0, + uint32_t E1, + uint32_t E2, + uint32_t E3 +) +{ + assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); + _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result = { V.vector4_f32[E0], + V.vector4_f32[E1], + V.vector4_f32[E2], + V.vector4_f32[E3] }; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const uint32_t ControlElement[ 4 ] = + { +#ifdef _XM_LITTLEENDIAN_ + 0x03020100, // XM_SWIZZLE_X + 0x07060504, // XM_SWIZZLE_Y + 0x0B0A0908, // XM_SWIZZLE_Z + 0x0F0E0D0C, // XM_SWIZZLE_W +#else + 0x00010203, // XM_SWIZZLE_X + 0x04050607, // XM_SWIZZLE_Y + 0x08090A0B, // XM_SWIZZLE_Z + 0x0C0D0E0F, // XM_SWIZZLE_W +#endif + }; + + int8x8x2_t tbl; + tbl.val[0] = vget_low_f32(V); + tbl.val[1] = vget_high_f32(V); + + __n64 idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) ); + const __n64 rL = vtbl2_u8( tbl, idx ); + + idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) ); + const __n64 rH = vtbl2_u8( tbl, idx ); + + return vcombine_f32( rL, rH ); +#elif defined(_XM_VMX128_INTRINSICS_) +#else + const uint32_t *aPtr = (const uint32_t* )(&V); + + XMVECTOR Result; + uint32_t *pWork = (uint32_t*)(&Result); + + pWork[0] = aPtr[E0]; + pWork[1] = aPtr[E1]; + pWork[2] = aPtr[E2]; + pWork[3] = aPtr[E3]; + + 
return Result; +#endif +} + +//------------------------------------------------------------------------------ +// Builds a vector from components of two vectors: indices 0-3 pick from V1, 4-7 pick from V2 +inline XMVECTOR XMVectorPermute +( + FXMVECTOR V1, + FXMVECTOR V2, + uint32_t PermuteX, + uint32_t PermuteY, + uint32_t PermuteZ, + uint32_t PermuteW +) +{ + assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const uint32_t ControlElement[ 8 ] = + { +#ifdef _XM_LITTLEENDIAN_ + 0x03020100, // XM_PERMUTE_0X + 0x07060504, // XM_PERMUTE_0Y + 0x0B0A0908, // XM_PERMUTE_0Z + 0x0F0E0D0C, // XM_PERMUTE_0W + 0x13121110, // XM_PERMUTE_1X + 0x17161514, // XM_PERMUTE_1Y + 0x1B1A1918, // XM_PERMUTE_1Z + 0x1F1E1D1C, // XM_PERMUTE_1W +#else + 0x00010203, // XM_PERMUTE_0X + 0x04050607, // XM_PERMUTE_0Y + 0x08090A0B, // XM_PERMUTE_0Z + 0x0C0D0E0F, // XM_PERMUTE_0W + 0x10111213, // XM_PERMUTE_1X + 0x14151617, // XM_PERMUTE_1Y + 0x18191A1B, // XM_PERMUTE_1Z + 0x1C1D1E1F, // XM_PERMUTE_1W +#endif + }; + + int8x8x4_t tbl; + tbl.val[0] = vget_low_f32(V1); + tbl.val[1] = vget_high_f32(V1); + tbl.val[2] = vget_low_f32(V2); + tbl.val[3] = vget_high_f32(V2); + + __n64 idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) ); + const __n64 rL = vtbl4_u8( tbl, idx ); + + idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) ); + const __n64 rH = vtbl4_u8( tbl, idx ); + + return vcombine_f32( rL, rH ); +#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) +#else + + const uint32_t *aPtr[2]; + aPtr[0] = (const uint32_t* )(&V1); + aPtr[1] = (const uint32_t* )(&V2); + + XMVECTOR Result; + uint32_t *pWork = (uint32_t*)(&Result); + + // index & 3 selects the component, index >> 2 selects V1 (0) or V2 (1) + const uint32_t i0 = PermuteX & 3; + const uint32_t vi0 = PermuteX >> 2; + pWork[0] = aPtr[vi0][i0]; + + const uint32_t i1 = PermuteY & 3; + const uint32_t 
vi1 = PermuteY >> 2; + pWork[1] = aPtr[vi1][i1]; + + const uint32_t i2 = PermuteZ & 3; + const uint32_t vi2 = PermuteZ >> 2; + pWork[2] = aPtr[vi2][i2]; + + const uint32_t i3 = PermuteW & 3; + const uint32_t vi3 = PermuteW >> 2; + pWork[3] = aPtr[vi3][i3]; + + return Result; +#endif +} + +//------------------------------------------------------------------------------ +// Define a control vector to be used in XMVectorSelect +// operations. The four integers specified in XMVectorSelectControl +// serve as indices to select between components in two vectors. +// The first index controls selection for the first component of +// the vectors involved in a select operation, the second index +// controls selection for the second component etc. A value of +// zero for an index causes the corresponding component from the first +// vector to be selected whereas a one causes the component from the +// second vector to be selected instead. + +inline XMVECTOR XMVectorSelectControl +( + uint32_t VectorIndex0, + uint32_t VectorIndex1, + uint32_t VectorIndex2, + uint32_t VectorIndex3 +) +{ +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + // x=Index0,y=Index1,z=Index2,w=Index3 + __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); + // Any non-zero entries become 0xFFFFFFFF else 0 + vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); + return reinterpret_cast<__m128 *>(&vTemp)[0]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + __n64 V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32)); + __n64 V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32)); + __n128 vTemp = vcombine_s32(V0, V1); + // Any non-zero entries become 0xFFFFFFFF else 0 + return vcgtq_s32(vTemp,g_XMZero); +#else + XMVECTOR ControlVector; + const uint32_t ControlElement[] = + { + XM_SELECT_0, + XM_SELECT_1 + }; + + assert(VectorIndex0 < 2); + assert(VectorIndex1 < 2); + assert(VectorIndex2 < 2); + 
assert(VectorIndex3 < 2); + _Analysis_assume_(VectorIndex0 < 2); + _Analysis_assume_(VectorIndex1 < 2); + _Analysis_assume_(VectorIndex2 < 2); + _Analysis_assume_(VectorIndex3 < 2); + + ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; + ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; + ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; + ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; + + return ControlVector; + +#endif +} + +//------------------------------------------------------------------------------ + +// Per-bit select: takes each bit from V2 where Control has a 1 bit, else from V1 +inline XMVECTOR XMVectorSelect +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Control +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); + Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); + Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); + Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vbslq_f32( Control, V2, V1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); + XMVECTOR vTemp2 = _mm_and_ps(V2,Control); + return _mm_or_ps(vTemp1,vTemp2); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Interleaves the x,y components: returns (V1.x, V2.x, V1.y, V2.y) +inline XMVECTOR XMVectorMergeXY +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[0]; + Result.vector4_u32[1] = V2.vector4_u32[0]; + Result.vector4_u32[2] = V1.vector4_u32[1]; + Result.vector4_u32[3] = V2.vector4_u32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 
).val[0]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpacklo_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Interleaves the z,w components: returns (V1.z, V2.z, V1.w, V2.w) +inline XMVECTOR XMVectorMergeZW +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_u32[0] = V1.vector4_u32[2]; + Result.vector4_u32[1] = V2.vector4_u32[2]; + Result.vector4_u32[2] = V1.vector4_u32[3]; + Result.vector4_u32[3] = V2.vector4_u32[3]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vzipq_f32( V1, V2 ).val[1]; +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_unpackhi_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Shifts V1's components left by Elements places, filling the vacated slots from V2 +inline XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); +} + +//------------------------------------------------------------------------------ + +// Rotates V's components left by Elements places +inline XMVECTOR XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); +} + +//------------------------------------------------------------------------------ + +// Rotates V's components right by Elements places +inline XMVECTOR XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) +{ + assert( Elements < 4 ); + _Analysis_assume_( Elements < 4 ); + return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); +} + +//------------------------------------------------------------------------------ + +// Rotates VS left, then selects between VD and the rotated VS per the Select flags +inline XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements, + uint32_t Select0, uint32_t 
Select1, uint32_t Select2, uint32_t Select3) +{ + XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1); + return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control ); +} + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +// Component-wise ==; each lane of the result is 0xFFFFFFFF where equal, 0 otherwise +inline XMVECTOR XMVectorEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vceqq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpeq_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// As XMVectorEqual, but also returns a CR6 comparison record (all-true / all-false) through pCR +_Use_decl_annotations_ +inline XMVECTOR XMVectorEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != NULL ); +#if defined(_XM_NO_INTRINSICS_) + uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vceqq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Treat the components of the vectors as unsigned integers and +// compare individual bits between the two. This is useful for +// comparing control vectors and result vectors returned from +// other comparison operations. + +inline XMVECTOR XMVectorEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 
0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vceqq_u32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// As XMVectorEqualInt, but also returns a CR6 comparison record through pCR +_Use_decl_annotations_ +inline XMVECTOR XMVectorEqualIntR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != NULL ); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control = XMVectorEqualInt(V1, V2); + + *pCR = 0; + if (XMVector4EqualInt(Control, XMVectorTrueInt())) + { + // All elements are equal + *pCR |= XM_CRMASK_CR6TRUE; + } + else if (XMVector4EqualInt(Control, XMVectorFalseInt())) + { + // All elements are not equal + *pCR |= XM_CRMASK_CR6FALSE; + } + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vceqq_u32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + // movemask collects the sign bit of each lane into 4 bits + int iTemp = _mm_movemask_ps(reinterpret_cast<const __m128*>(&V)[0]); + uint32_t CR = 0; + if (iTemp==0x0F) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTemp) + { + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorNearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + + float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; + float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; + float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; + float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; + + fDeltax = fabsf(fDeltax); + fDeltay = fabsf(fDeltay); + fDeltaz = fabsf(fDeltaz); + fDeltaw = fabsf(fDeltaw); + + XMVECTOR Control; + Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vDelta = vsubq_f32(V1,V2); + return vacleq_f32( vDelta, Epsilon ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorNotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmvnq_u32(vceqq_f32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpneq_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Bitwise (integer) inequality per component +inline XMVECTOR XMVectorNotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmvnq_u32(vceqq_u32(V1, V2)); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); + return _mm_xor_ps(reinterpret_cast<__m128 *>(&V)[0],g_XMNegOneMask); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Component-wise > test +inline XMVECTOR XMVectorGreater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcgtq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpgt_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// As XMVectorGreater, but also returns a CR6 comparison record through pCR +_Use_decl_annotations_ +inline XMVECTOR XMVectorGreaterR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != NULL ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vcgtq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#else // _XM_VMX128_INTRINSICS_ 
+#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// Component-wise >= test +inline XMVECTOR XMVectorGreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcgeq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmpge_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +// As XMVectorGreaterOrEqual, but also returns a CR6 comparison record through pCR +_Use_decl_annotations_ +inline XMVECTOR XMVectorGreaterOrEqualR +( + uint32_t* pCR, + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + assert( pCR != NULL ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are greater + CR = XM_CRMASK_CR6TRUE; + } + else if (!(ux|uy|uz|uw)) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are greater or equal + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + // All elements are not greater or equal + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp); + if (iTest==0xf) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + // All elements are not greater + CR = XM_CRMASK_CR6FALSE; + } + *pCR = CR; + return vTemp; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorLess +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcltq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmplt_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorLessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vcleq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_cmple_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorInBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0; + Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 
0xFFFFFFFF : 0; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + vTemp1 = vandq_u32(vTemp1,vTemp2); + return vTemp1; +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + return vTemp1; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline XMVECTOR XMVectorInBoundsR +( + uint32_t* pCR, + FXMVECTOR V, + FXMVECTOR Bounds +) +{ + assert( pCR != NULL ); +#if defined(_XM_NO_INTRINSICS_) + + uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; + + uint32_t CR = 0; + if (ux&uy&uz&uw) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + + XMVECTOR Control; + Control.vector4_u32[0] = ux; + Control.vector4_u32[1] = uy; + Control.vector4_u32[2] = uz; + Control.vector4_u32[3] = uw; + return Control; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + vTemp1 = vandq_u32(vTemp1,vTemp2); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFU ) + { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + + uint32_t CR = 0; + if (_mm_movemask_ps(vTemp1)==0xf) { + // All elements are in bounds + CR = XM_CRMASK_CR6BOUNDS; + } + *pCR = CR; + return vTemp1; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorIsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Control; + Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0; + Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 
0xFFFFFFFFU : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test against itself. NaN is always not equal
    __n128 vTempNan = vceqq_f32( V, V );
    // Flip results
    return vmvnq_u32( vTempNan );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    return _mm_cmpneq_ps(V,V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where the component is +/- infinity, 0 elsewhere.
inline XMVECTOR XMVectorIsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    __n128 vTemp = vandq_u32(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = vceqq_f32(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return vTemp;
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Rounding and clamping operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Per-component minimum of V1 and V2.
inline XMVECTOR XMVectorMin
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vminq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_min_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component maximum of V1 and V2.
inline XMVECTOR XMVectorMax
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmaxq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_max_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component round-to-nearest integer; halves round away from zero
// (bias by +/-0.5 then truncate).
inline XMVECTOR XMVectorRound
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();
    const XMVECTOR BiasPos = XMVectorReplicate(0.5f);
    const XMVECTOR BiasNeg = XMVectorReplicate(-0.5f);

    XMVECTOR Bias = XMVectorLess(V, Zero);
    Bias = XMVectorSelect(BiasPos, BiasNeg, Bias);
    XMVECTOR Result = XMVectorAdd(V, Bias);
    Result = XMVectorTruncate(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 Bias = vcltq_f32( V, vdupq_n_u32(0) );

    __n128 BiasPos = vdupq_n_f32( 0.5f );
    __n128 BiasNeg = vdupq_n_f32( -0.5f );
    Bias = vbslq_f32( Bias, BiasNeg, BiasPos );
    __n128 V0 = vaddq_f32( V, Bias );
    __n128 vInt = vcvtq_s32_f32( V0 );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF)
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    __m128i vInt = _mm_cvtps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component truncation toward zero; NaN lanes become 0x7FC00000 (QNaN),
// values with magnitude >= 8388608 (2^23, no fractional part) pass through.
inline XMVECTOR XMVectorTruncate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    uint32_t i;

    // Avoid C4701
    Result.vector4_f32[0] = 0.0f;

    for (i = 0; i < 4; i++)
    {
        if (XMISNAN(V.vector4_f32[i]))
        {
            Result.vector4_u32[i] = 0x7FC00000;
        }
        else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
        {
            Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]);
        }
        else
        {
            Result.vector4_f32[i] = V.vector4_f32[i];
        }
    }
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 vInt = vcvtq_s32_f32( V );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF)
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding with truncation
    __m128i vInt = _mm_cvttps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component floor (round toward negative infinity).
inline XMVECTOR XMVectorFloor
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = {
        floorf(V.vector4_f32[0]),
        floorf(V.vector4_f32[1]),
        floorf(V.vector4_f32[2]),
        floorf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 0x3EFFFFA0 is the bit pattern of a float just under 0.5; subtract it
    // then round-to-nearest to get floor behavior
    __n128 V0 = vsubq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF)
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_sub_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component ceiling (round toward positive infinity).
inline XMVECTOR XMVectorCeiling
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
        ceilf(V.vector4_f32[0]),
        ceilf(V.vector4_f32[1]),
        ceilf(V.vector4_f32[2]),
        ceilf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 0x3EFFFFA0 is the bit pattern of a float just under 0.5; add it
    // then round-to-nearest to get ceiling behavior
    __n128 V0 = vaddq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF)
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_add_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component clamp of V to [Min, Max]; asserts Min <= Max in all lanes.
inline XMVECTOR XMVectorClamp
(
    FXMVECTOR V,
    FXMVECTOR Min,
    FXMVECTOR Max
)
{
    assert(XMVector4LessOrEqual(Min, Max));

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVectorMax(Min, V);
    Result = XMVectorMin(Max, Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vResult;
    vResult = vmaxq_f32(Min,V);
    vResult = vminq_f32(vResult,Max);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult;
    vResult = _mm_max_ps(Min,V);
    vResult = _mm_min_ps(vResult,Max);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component clamp of V to [0, 1].
inline XMVECTOR XMVectorSaturate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    return XMVectorClamp(V, Zero, g_XMOne.v);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = vmaxq_f32(V, vdupq_n_u32(0) );
    // Set>1 to 1
    return vminq_f32(vResult,
vdupq_n_f32(1.0f) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Set>1 to 1
    return _mm_min_ps(vResult,g_XMOne);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Bitwise logical operations
//------------------------------------------------------------------------------

// Per-component bitwise AND of the raw 128-bit vector contents.
inline XMVECTOR XMVectorAndInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vandq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_and_ps(V1,V2);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component bitwise AND-NOT: V1 & ~V2.
inline XMVECTOR XMVectorAndCInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vbicq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Note: _mm_andnot_si128 computes ~first & second, hence the operand swap
    __m128i V = _mm_andnot_si128( _mm_castps_si128(V2), _mm_castps_si128(V1) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component bitwise OR of the raw 128-bit vector contents.
inline XMVECTOR XMVectorOrInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vorrq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component bitwise NOR: ~(V1 | V2).
inline XMVECTOR XMVectorNorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]);
    Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]);
    Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]);
    Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 Result = vorrq_u32(V1,V2);
    return vbicq_u32(g_XMNegOneMask, Result);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i Result;
    Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    Result = _mm_andnot_si128( Result,g_XMNegOneMask);
    return reinterpret_cast<__m128 *>(&Result)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component bitwise XOR of the raw 128-bit vector contents.
inline XMVECTOR XMVectorXorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return veorq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Per-component negation.
inline XMVECTOR XMVectorNegate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = -V.vector4_f32[0];
    Result.vector4_f32[1] = -V.vector4_f32[1];
    Result.vector4_f32[2] = -V.vector4_f32[2];
    Result.vector4_f32[3] = -V.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vnegq_f32(V);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Z;

    Z = _mm_setzero_ps();

    return _mm_sub_ps( Z, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component sum V1 + V2.
inline XMVECTOR XMVectorAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] + V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vaddq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_add_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component angle sum with wrap-around into [-Pi, Pi).
inline XMVECTOR XMVectorAddAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Add the given angles together. If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorAdd(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vaddq_f32(V1,V2);
    // Less than Pi?
    __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_add_ps(V1,V2);
    // Less than Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component difference V1 - V2.
inline XMVECTOR XMVectorSubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsubq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_sub_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component angle difference with wrap-around into [-Pi, Pi).
inline XMVECTOR XMVectorSubtractAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Subtract the given angles. If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorSubtract(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vsubq_f32(V1,V2);
    // Less than Pi?
+ __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = vaddq_f32(vResult,vOffset); + // Greater than or equal to Pi? + vOffset = vcgeq_f32(vResult,g_XMPi); + vOffset = vandq_u32(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = vsubq_f32(vResult,vOffset); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Adjust the angles + XMVECTOR vResult = _mm_sub_ps(V1,V2); + // Less than Pi? + XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Add 2Pi to all entries less than -Pi + vResult = _mm_add_ps(vResult,vOffset); + // Greater than or equal to Pi? + vOffset = _mm_cmpge_ps(vResult,g_XMPi); + vOffset = _mm_and_ps(vOffset,g_XMTwoPi); + // Sub 2Pi to all entries greater than Pi + vResult = _mm_sub_ps(vResult,vOffset); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorMultiply +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result = { + V1.vector4_f32[0] * V2.vector4_f32[0], + V1.vector4_f32[1] * V2.vector4_f32[1], + V1.vector4_f32[2] * V2.vector4_f32[2], + V1.vector4_f32[3] * V2.vector4_f32[3] + }; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_f32( V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_mul_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorMultiplyAdd +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + (V1.vector4_f32[0] * V2.vector4_f32[0]) + V3.vector4_f32[0], + (V1.vector4_f32[1] * V2.vector4_f32[1]) + V3.vector4_f32[1], + (V1.vector4_f32[2] * 
V2.vector4_f32[2]) + V3.vector4_f32[2], + (V1.vector4_f32[3] * V2.vector4_f32[3]) + V3.vector4_f32[3] + }; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmlaq_f32( V3, V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_mul_ps( V1, V2 ); + return _mm_add_ps(vResult, V3 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorDivide +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0]; + Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1]; + Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2]; + Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + __n128 Reciprocal = vrecpeq_f32(V2); + __n128 S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V2 ); + Reciprocal = vmulq_f32( S, Reciprocal ); + return vmulq_f32( V1, Reciprocal ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps( V1, V2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorNegativeMultiplySubtract +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR V3 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR vResult = { + V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]), + V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]), + V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]), + V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3]) + }; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmlsq_f32( V3, V1, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + 
XMVECTOR R = _mm_mul_ps( V1, V2 ); + return _mm_sub_ps( V3, R ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorScale +( + FXMVECTOR V, + float ScaleFactor +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + V.vector4_f32[0] * ScaleFactor, + V.vector4_f32[1] * ScaleFactor, + V.vector4_f32[2] * ScaleFactor, + V.vector4_f32[3] * ScaleFactor + }; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vmulq_n_f32( V, ScaleFactor ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_set_ps1(ScaleFactor); + return _mm_mul_ps(vResult,V); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorReciprocalEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; + Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; + Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; + Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrecpeq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rcp_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorReciprocal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; + Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; + Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; + Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement + __n128 Reciprocal = vrecpeq_f32(V); + __n128 S = vrecpsq_f32( Reciprocal, V ); + 
Reciprocal = vmulq_f32( S, Reciprocal ); + S = vrecpsq_f32( Reciprocal, V ); + return vmulq_f32( S, Reciprocal ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_div_ps(g_XMOne,V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Return an estimated square root +inline XMVECTOR XMVectorSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 1 iteration of Newton-Raphson refinment of sqrt + __n128 S0 = vrsqrteq_f32(V); + __n128 P0 = vmulq_f32( V, S0 ); + __n128 R0 = vrsqrtsq_f32( P0, S0 ); + __n128 S1 = vmulq_f32( S0, R0 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + __n128 Result = vmulq_f32( V, S1 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 3 iterations of Newton-Raphson refinment of sqrt + __n128 S0 = vrsqrteq_f32(V); + __n128 P0 = vmulq_f32( V, S0 ); + __n128 R0 = vrsqrtsq_f32( P0, S0 ); + __n128 S1 = vmulq_f32( S0, R0 
); + __n128 P1 = vmulq_f32( V, S1 ); + __n128 R1 = vrsqrtsq_f32( P1, S1 ); + __n128 S2 = vmulq_f32( S1, R1 ); + __n128 P2 = vmulq_f32( V, S2 ); + __n128 R2 = vrsqrtsq_f32( P2, S2 ); + __n128 S3 = vmulq_f32( S2, R2 ); + + XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); + XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); + __n128 Result = vmulq_f32( V, S3 ); + XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); + return XMVectorSelect(V, Result, Select); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_sqrt_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorReciprocalSqrtEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vrsqrteq_f32(V); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_rsqrt_ps(V); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorReciprocalSqrt +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); + Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); + Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); + Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // 2 iterations of Newton-Raphson refinement of reciprocal + __n128 S0 = vrsqrteq_f32(V); + + __n128 P0 = vmulq_f32( V, S0 ); + __n128 R0 = vrsqrtsq_f32( P0, S0 ); + + __n128 S1 = vmulq_f32( S0, R0 ); + __n128 P1 = 
vmulq_f32( V, S1 ); + __n128 R1 = vrsqrtsq_f32( P1, S1 ); + + return vmulq_f32( S1, R1 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_sqrt_ps(V); + vResult = _mm_div_ps(g_XMOne,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorExp +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); + Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); + Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); + Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + powf(2.0f,vgetq_lane_f32(V, 0)), + powf(2.0f,vgetq_lane_f32(V, 1)), + powf(2.0f,vgetq_lane_f32(V, 2)), + powf(2.0f,vgetq_lane_f32(V, 3)) + }; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __declspec(align(16)) float a[4]; + _mm_store_ps( a, V ); + XMVECTOR vResult = _mm_setr_ps( + powf(2.0f,a[0]), + powf(2.0f,a[1]), + powf(2.0f,a[2]), + powf(2.0f,a[3])); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorLog +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); + + XMVECTOR Result; + Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; + Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; + Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; + Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vScale = vdupq_n_f32(1.0f / logf(2.0f)); + XMVECTORF32 vResult = { + logf(vgetq_lane_f32(V, 0)), + logf(vgetq_lane_f32(V, 1)), + logf(vgetq_lane_f32(V, 2)), + logf(vgetq_lane_f32(V, 3)) + }; + 
return vmulq_f32( vResult, vScale ); +#elif defined(_XM_SSE_INTRINSICS_) + __declspec(align(16)) float a[4]; + _mm_store_ps( a, V ); + XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f)); + XMVECTOR vResult = _mm_setr_ps( + logf(a[0]), + logf(a[1]), + logf(a[2]), + logf(a[3])); + vResult = _mm_mul_ps(vResult,vScale); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorPow +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]); + Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]); + Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]); + Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), + powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), + powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), + powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) + }; + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + __declspec(align(16)) float a[4]; + __declspec(align(16)) float b[4]; + _mm_store_ps( a, V1 ); + _mm_store_ps( b, V2 ); + XMVECTOR vResult = _mm_setr_ps( + powf(a[0],b[0]), + powf(a[1],b[1]), + powf(a[2],b[2]), + powf(a[3],b[3])); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorAbs +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + fabsf(V.vector4_f32[0]), + fabsf(V.vector4_f32[1]), + fabsf(V.vector4_f32[2]), + fabsf(V.vector4_f32[3]) + }; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vabsq_f32( V ); +#elif 
defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_setzero_ps(); + vResult = _mm_sub_ps(vResult,V); + vResult = _mm_max_ps(vResult,V); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorMod +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // V1 % V2 = V1 - V2 * truncate(V1 / V2) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Quotient = XMVectorDivide(V1, V2); + Quotient = XMVectorTruncate(Quotient); + XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR vResult = XMVectorDivide(V1, V2); + vResult = XMVectorTruncate(vResult); + return vmlsq_f32( V1, vResult, V2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vResult = _mm_div_ps(V1, V2); + vResult = XMVectorTruncate(vResult); + vResult = _mm_mul_ps(vResult,V2); + vResult = _mm_sub_ps(V1,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorModAngles +( + FXMVECTOR Angles +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + XMVECTOR Result; + + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); + V = XMVectorRound(V); + Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + return vmlsq_f32( Angles, vResult, g_XMTwoPi ); +#elif defined(_XM_SSE_INTRINSICS_) + // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI + XMVECTOR vResult = 
_mm_mul_ps(Angles,g_XMReciprocalTwoPi); + // Use the inline function due to complexity for rounding + vResult = XMVectorRound(vResult); + vResult = _mm_mul_ps(vResult,g_XMTwoPi); + vResult = _mm_sub_ps(Angles,vResult); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorSin +( + FXMVECTOR V +) +{ + // 11-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = XMScalarSin( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarSin( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarSin( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarSin( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __n128 sign = vandq_u32(x, g_XMNegativeZero); + __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __n128 absx = vabsq_f32( x ); + __n128 rflx = vsubq_f32(c, x); + __n128 comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // 
Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorCos +( + FXMVECTOR V +) +{ + // 10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = XMScalarCos( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarCos( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarCos( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarCos( V.vector4_f32[3] ); + return Result; +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + __n128 sign = vandq_u32(x, g_XMNegativeZero); + __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __n128 absx = vabsq_f32( x ); + __n128 rflx = vsubq_f32(c, x); + __n128 comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + XMVECTOR Result = vdupq_lane_f32(vget_low_f32(CC1), 0); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CC1 = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMVectorSinCos +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != NULL); + assert(pCos != NULL); + + // 11/10-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Sin; + XMVECTOR Cos; + + XMScalarSinCos(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]); + XMScalarSinCos(&Sin.vector4_f32[1], 
&Cos.vector4_f32[1], V.vector4_f32[1]); + XMScalarSinCos(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]); + XMScalarSinCos(&Sin.vector4_f32[3], &Cos.vector4_f32[3], V.vector4_f32[3]); + + *pSin = Sin; + *pCos = Cos; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + __n128 sign = vandq_u32(x, g_XMNegativeZero); + __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __n128 absx = vabsq_f32( x ); + __n128 rflx = vsubq_f32(c, x); + __n128 comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation for cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + Result = vdupq_lane_f32(vget_low_f32(CC1), 0); + + const XMVECTOR CC0 = g_XMCosCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); + 
Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation of sine + const XMVECTOR SC1 = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR SC0 = g_XMSinCoefficients0; + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation of cosine + const XMVECTOR CC1 = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_mul_ps(vConstants, x2); + + const XMVECTOR CC0 = 
g_XMCosCoefficients0; + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorTan +( + FXMVECTOR V +) +{ + // Cody and Waite algorithm to compute tangent. + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; + static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; + static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; + static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; + + XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); + + XMVECTOR Zero = XMVectorZero(); + + XMVECTOR C0 = XMVectorSplatX(TanConstants.v); + XMVECTOR C1 = XMVectorSplatY(TanConstants.v); + XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); + + XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); + + VA = 
XMVectorRound(VA); + + XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); + + XMVECTOR VB = XMVectorAbs(VA); + + VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); + +#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + VB = vcvtq_u32_f32( VB ); +#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); +#else + for (size_t i = 0; i < 4; i++) + { + VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i]; + } +#endif + + XMVECTOR VC2 = XMVectorMultiply(VC, VC); + + XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); + XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); + XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); + XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); + XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v); + XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); + XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); + XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); + + XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); + VBIsEven = XMVectorEqualInt(VBIsEven, Zero); + + XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); + XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); + N = XMVectorMultiplyAdd(VC2, N, T5); + D = XMVectorMultiplyAdd(VC2, D, T2); + N = XMVectorMultiply(VC2, N); + D = XMVectorMultiplyAdd(VC2, D, T1); + N = XMVectorMultiplyAdd(VC, N, VC); + XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); + D = XMVectorMultiplyAdd(VC2, D, T0); + + N = XMVectorSelect(N, VC, VCNearZero); + D = XMVectorSelect(D, g_XMOne.v, VCNearZero); + + XMVECTOR R0 = XMVectorNegate(N); + XMVECTOR R1 = XMVectorDivide(N,D); + R0 = XMVectorDivide(D,R0); + + XMVECTOR VIsZero = XMVectorEqual(V, Zero); + + XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); + + Result = XMVectorSelect(Result, Zero, VIsZero); + + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR 
XMVectorSinH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = sinhf( V.vector4_f32[0] ); + Result.vector4_f32[1] = sinhf( V.vector4_f32[1] ); + Result.vector4_f32[2] = sinhf( V.vector4_f32[2] ); + Result.vector4_f32[3] = sinhf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, Scale.v ); + XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return vsubq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V, Scale); + V1 = _mm_add_ps(V1,g_XMNegativeOne); + XMVECTOR V2 = _mm_mul_ps(V, Scale); + V2 = _mm_sub_ps(g_XMNegativeOne,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + + return _mm_sub_ps(E1, E2); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorCosH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = coshf( V.vector4_f32[0] ); + Result.vector4_f32[1] = coshf( V.vector4_f32[1] ); + Result.vector4_f32[2] = coshf( V.vector4_f32[2] ); + Result.vector4_f32[3] = coshf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return 
vaddq_f32(E1, E2); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) + + XMVECTOR V1 = _mm_mul_ps(V,Scale.v); + V1 = _mm_add_ps(V1,g_XMNegativeOne.v); + XMVECTOR V2 = _mm_mul_ps(V, Scale.v); + V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); + XMVECTOR E1 = XMVectorExp(V1); + XMVECTOR E2 = XMVectorExp(V2); + return _mm_add_ps(E1, E2); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorTanH +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = tanhf( V.vector4_f32[0] ); + Result.vector4_f32[1] = tanhf( V.vector4_f32[1] ); + Result.vector4_f32[2] = tanhf( V.vector4_f32[2] ); + Result.vector4_f32[3] = tanhf( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = vmulq_f32(V, Scale.v); + E = XMVectorExp(E); + E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); + E = XMVectorReciprocal(E); + return vsubq_f32(g_XMOne.v, E); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) + + XMVECTOR E = _mm_mul_ps(V, Scale.v); + E = XMVectorExp(E); + E = _mm_mul_ps(E,g_XMOneHalf.v); + E = _mm_add_ps(E,g_XMOneHalf.v); + E = _mm_div_ps(g_XMOne.v,E); + return _mm_sub_ps(g_XMOne.v,E); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorASin +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = 
XMScalarASin( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarASin( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarASin( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarASin( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 nonnegative = vcgeq_f32(V, g_XMZero); + __n128 x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __n128 oneMValue = vsubq_f32(g_XMOne, x); + __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + __n128 root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + __n128 t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorACos +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = XMScalarACos( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarACos( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarACos( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarACos( 
V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 nonnegative = vcgeq_f32(V, g_XMZero); + __n128 x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __n128 oneMValue = vsubq_f32(g_XMOne, x); + __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + __n128 root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + __n128 t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AC1 = g_XMArcCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + const XMVECTOR AC0 = g_XMArcCoefficients0; + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorATan +( + FXMVECTOR V +) +{ + // 17-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); + Result.vector4_f32[1] = atanf( V.vector4_f32[1] ); + Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); + Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); + return Result; +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + __n128 absV = vabsq_f32(V); + __n128 invV = XMVectorReciprocal(V); + __n128 comp = vcgtq_f32(V, g_XMOne); + __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign); + __n128 x = vbslq_f32(comp, V, invV); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + __n128 Result = vdupq_lane_f32(vget_high_f32(TC1), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + Result = vmlaq_f32( g_XMOne, Result, x2 ); + Result = vmulq_f32( Result, x ); + + __n128 result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); 
+ select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR TC1 = g_XMATanCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + const XMVECTOR TC0 = g_XMATanCoefficients0; + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorATan2 +( + FXMVECTOR Y, + FXMVECTOR X +) +{ + // Return the inverse tangent of Y / X 
in the range of -Pi to Pi with the following exceptions: + + // Y == 0 and X is Negative -> Pi with the sign of Y + // y == 0 and x is positive -> 0 with the sign of y + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) + // X == -Infinity and Finite Y -> Pi with the sign of Y + // X == +Infinity and Finite Y -> 0 with the sign of Y + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y + + static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = 
XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR V = XMVectorDivide(Y, X); + + XMVECTOR R0 = XMVectorATan(V); + + R1 = XMVectorSelect( Pi, Zero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + return XMVectorSelect(Result, R2, ATanResultValid); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorSinEst +( + FXMVECTOR V +) +{ + // 7-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = XMScalarSinEst( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarSinEst( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarSinEst( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarSinEst( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). + __n128 sign = vandq_u32(x, g_XMNegativeZero); + __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __n128 absx = vabsq_f32( x ); + __n128 rflx = vsubq_f32(c, x); + __n128 comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, x); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
+ __m128 sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorCosEst +( + FXMVECTOR V +) +{ + // 6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = XMScalarCosEst( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarCosEst( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarCosEst( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarCosEst( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ __n128 sign = vandq_u32(x, g_XMNegativeZero); + __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __n128 absx = vabsq_f32( x ); + __n128 rflx = vsubq_f32(c, x); + __n128 comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR Result = vdupq_lane_f32(vget_high_f32(CEC), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + Result = vmulq_f32(Result, sign); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + // Map V to x in [-pi,pi]. + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). + XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = 
_mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline void XMVectorSinCosEst +( + XMVECTOR* pSin, + XMVECTOR* pCos, + FXMVECTOR V +) +{ + assert(pSin != NULL); + assert(pCos != NULL); + + // 7/6-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Sin; + XMVECTOR Cos; + + XMScalarSinCosEst(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]); + XMScalarSinCosEst(&Sin.vector4_f32[1], &Cos.vector4_f32[1], V.vector4_f32[1]); + XMScalarSinCosEst(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]); + XMScalarSinCosEst(&Sin.vector4_f32[3], &Cos.vector4_f32[3], V.vector4_f32[3]); + + *pSin = Sin; + *pCos = Cos; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
+ __n128 sign = vandq_u32(x, g_XMNegativeZero); + __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __n128 absx = vabsq_f32( x ); + __n128 rflx = vsubq_f32(c, x); + __n128 comp = vcleq_f32(absx, g_XMHalfPi); + x = vbslq_f32( comp, x, rflx ); + sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pSin = vmulq_f32(Result, x); + + // Compute polynomial approximation + const XMVECTOR CEC = g_XMCosCoefficients1; + Result = vdupq_lane_f32(vget_high_f32(CEC), 1); + + vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); + Result = vmlaq_f32(vConstants, Result, x2); + + vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); + Result = vmlaq_f32(vConstants, Result, x2); + + Result = vmlaq_f32(g_XMOne, Result, x2); + *pCos = vmulq_f32(Result, sign); +#elif defined(_XM_SSE_INTRINSICS_) + // Force the value within the bounds of pi + XMVECTOR x = XMVectorModAngles(V); + + // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
+ XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); + __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 + __m128 absx = _mm_andnot_ps(sign, x); // |x| + __m128 rflx = _mm_sub_ps(c, x); + __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); + __m128 select0 = _mm_and_ps(comp, x); + __m128 select1 = _mm_andnot_ps(comp, rflx); + x = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, g_XMOne); + select1 = _mm_andnot_ps(comp, g_XMNegativeOne); + sign = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation for sine + const XMVECTOR SEC = g_XMSinCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, x); + *pSin = Result; + + // Compute polynomial approximation for cosine + const XMVECTOR CEC = g_XMCosCoefficients1; + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); + Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + Result = _mm_add_ps(Result, g_XMOne); + Result = _mm_mul_ps(Result, sign); + *pCos = Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorTanEst +( + FXMVECTOR V +) +{ + XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); + + XMVECTOR V1 = 
XMVectorMultiply(V, OneOverPi); + V1 = XMVectorRound(V1); + + V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); + + XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); + XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); + XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); + + XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); + XMVECTOR V2 = XMVectorMultiply(V1, V1); + XMVECTOR V1T0 = XMVectorMultiply(V1, T0); + XMVECTOR V1T1 = XMVectorMultiply(V1, T1); + + XMVECTOR D = XMVectorReciprocalEst(V2T2); + XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); + + return XMVectorMultiply(N, D); +} + + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorASinEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = XMScalarASinEst( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarASinEst( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarASinEst( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarASinEst( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 nonnegative = vcgeq_f32(V, g_XMZero); + __n128 x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __n128 oneMValue = vsubq_f32(g_XMOne, x); + __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + __n128 root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + __n128 t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + t0 = vsubq_f32(g_XMHalfPi, t0); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + t0 = _mm_sub_ps(g_XMHalfPi, t0); + return t0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorACosEst +( + FXMVECTOR V +) +{ + // 3-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = XMScalarACosEst( V.vector4_f32[0] ); + Result.vector4_f32[1] = XMScalarACosEst( V.vector4_f32[1] ); + Result.vector4_f32[2] = XMScalarACosEst( V.vector4_f32[2] ); + Result.vector4_f32[3] = XMScalarACosEst( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 nonnegative = vcgeq_f32(V, g_XMZero); + __n128 x = vabsq_f32(V); + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. + __n128 oneMValue = vsubq_f32(g_XMOne, x); + __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); + __n128 root = XMVectorSqrt(clampOneMValue); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + t0 = vmlaq_f32( vConstants, t0, x ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); + t0 = vmlaq_f32( vConstants, t0, x ); + t0 = vmulq_f32(t0, root); + + __n128 t1 = vsubq_f32(g_XMPi, t0); + t0 = vbslq_f32( nonnegative, t0, t1 ); + return t0; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); + __m128 mvalue = _mm_sub_ps(g_XMZero, V); + __m128 x = _mm_max_ps(V, mvalue); // |V| + + // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
+ __m128 oneMValue = _mm_sub_ps(g_XMOne, x); + __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); + __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMArcEstCoefficients; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 t0 = _mm_mul_ps(vConstants, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, x); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + t0 = _mm_add_ps(t0, vConstants); + t0 = _mm_mul_ps(t0, root); + + __m128 t1 = _mm_sub_ps(g_XMPi, t0); + t0 = _mm_and_ps(nonnegative, t0); + t1 = _mm_andnot_ps(nonnegative, t1); + t0 = _mm_or_ps(t0, t1); + return t0; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +namespace Internal +{ + +inline float XMScalarATanEst +( + float Value +) +{ + float y, sign; + if (fabsf(Value) <= 1.0f) + { + y = Value; + sign = 0.0f; + } + else if (Value > 1.0f) + { + y = 1.0f / Value; + sign = 1.0f; + } + else + { + y = 1.0f / Value; + sign = -1.0f; + } + + // 9-degree minimax approximation + float y2 = y*y; + float poly = ((((0.0208351f*y2-0.085133f)*y2+0.180141f)*y2-0.3302995f)*y2+0.999866f)*y; + + return (sign == 0.0f ? 
poly : sign*XM_PIDIV2 - poly); +} + +}; // namespace Internal + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorATanEst +( + FXMVECTOR V +) +{ + // 9-degree minimax approximation + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR Result; + Result.vector4_f32[0] = Internal::XMScalarATanEst( V.vector4_f32[0] ); + Result.vector4_f32[1] = Internal::XMScalarATanEst( V.vector4_f32[1] ); + Result.vector4_f32[2] = Internal::XMScalarATanEst( V.vector4_f32[2] ); + Result.vector4_f32[3] = Internal::XMScalarATanEst( V.vector4_f32[3] ); + return Result; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 absV = vabsq_f32(V); + __n128 invV = XMVectorReciprocalEst(V); + __n128 comp = vcgtq_f32(V, g_XMOne); + __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne ); + comp = vcleq_f32(absV, g_XMOne); + sign = vbslq_f32(comp, g_XMZero, sign ); + __n128 x = vbslq_f32(comp, V, invV ); + + __n128 x2 = vmulq_f32(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + __n128 Result = vdupq_lane_f32(vget_high_f32(AEC), 1); + + XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); + Result = vmlaq_f32( vConstants, Result, x2 ); + + vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0); + Result = vmlaq_f32( vConstants, Result, x2 ); + + // ATanEstCoefficients0 is already splatted + Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 ); + Result = vmulq_f32( Result, x ); + + float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi); + result1 = vsubq_f32(result1, Result); + + comp = vceqq_f32(sign, g_XMZero); + Result = vbslq_f32( comp, Result, result1 ); + return Result; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 absV = XMVectorAbs(V); + __m128 invV = _mm_div_ps(g_XMOne, V); + __m128 comp = _mm_cmpgt_ps(V, g_XMOne); + __m128 select0 = _mm_and_ps(comp, g_XMOne); + __m128 select1 = 
_mm_andnot_ps(comp, g_XMNegativeOne); + __m128 sign = _mm_or_ps(select0, select1); + comp = _mm_cmple_ps(absV, g_XMOne); + select0 = _mm_and_ps(comp, g_XMZero); + select1 = _mm_andnot_ps(comp, sign); + sign = _mm_or_ps(select0, select1); + select0 = _mm_and_ps(comp, V); + select1 = _mm_andnot_ps(comp, invV); + __m128 x = _mm_or_ps(select0, select1); + + __m128 x2 = _mm_mul_ps(x, x); + + // Compute polynomial approximation + const XMVECTOR AEC = g_XMATanEstCoefficients1; + XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) ); + __m128 Result = _mm_mul_ps(vConstants, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) ); + Result = _mm_add_ps(Result, vConstants); + Result = _mm_mul_ps(Result, x2); + + // ATanEstCoefficients0 is already splatted + Result = _mm_add_ps(Result, g_XMATanEstCoefficients0); + Result = _mm_mul_ps(Result, x); + __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); + result1 = _mm_sub_ps(result1, Result); + + comp = _mm_cmpeq_ps(sign, g_XMZero); + select0 = _mm_and_ps(comp, Result); + select1 = _mm_andnot_ps(comp, result1); + Result = _mm_or_ps(select0, select1); + return Result; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorATan2Est +( + FXMVECTOR Y, + FXMVECTOR X +) +{ + static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */}; + + const XMVECTOR Zero = XMVectorZero(); + XMVECTOR ATanResultValid = XMVectorTrueInt(); + + XMVECTOR Pi = XMVectorSplatX(ATan2Constants); + XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); + XMVECTOR PiOverFour = 
XMVectorSplatZ(ATan2Constants); + XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); + + XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); + XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); + XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); + XIsPositive = XMVectorEqualInt(XIsPositive, Zero); + XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); + XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); + + XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v); + Pi = XMVectorOrInt(Pi, YSign); + PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); + PiOverFour = XMVectorOrInt(PiOverFour, YSign); + ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); + + XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); + XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); + XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); + XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); + XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); + XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); + ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); + + XMVECTOR Reciprocal = XMVectorReciprocalEst(X); + XMVECTOR V = XMVectorMultiply(Y, Reciprocal); + XMVECTOR R0 = XMVectorATanEst(V); + + R1 = XMVectorSelect( Pi, Zero, XIsPositive ); + R2 = XMVectorAdd(R0, R1); + + Result = XMVectorSelect(Result, R2, ATanResultValid); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorLerp +( + FXMVECTOR V0, + FXMVECTOR V1, + float t +) +{ + // V0 + t * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Scale = XMVectorReplicate(t); + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, Scale, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_n_f32( V0, L, t ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR L = _mm_sub_ps( V1, V0 ); + XMVECTOR S = _mm_set_ps1( t ); + XMVECTOR 
Result = _mm_mul_ps( L, S ); + return _mm_add_ps( Result, V0 ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorLerpV +( + FXMVECTOR V0, + FXMVECTOR V1, + FXMVECTOR T +) +{ + // V0 + T * (V1 - V0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Length = XMVectorSubtract(V1, V0); + return XMVectorMultiplyAdd(Length, T, V0); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR L = vsubq_f32( V1, V0 ); + return vmlaq_f32( V0, L, T ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR Length = _mm_sub_ps( V1, V0 ); + XMVECTOR Result = _mm_mul_ps( Length, T ); + return _mm_add_ps( Result, V0 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorHermite +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + float t +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t); + XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = XMVectorReplicate(t3 - t2); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = vdupq_n_f32(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = vdupq_n_f32(t3 - 2.0f * t2 + t); + XMVECTOR P1 = vdupq_n_f32(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = 
vdupq_n_f32(t3 - t2); + + XMVECTOR vResult = vmulq_f32(P0, Position0); + vResult = vmlaq_f32( vResult, T0, Tangent0 ); + vResult = vmlaq_f32( vResult, P1, Position1 ); + vResult = vmlaq_f32( vResult, T1, Tangent1 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f); + XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t); + XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2); + XMVECTOR T1 = _mm_set_ps1(t3 - t2); + + XMVECTOR vResult = _mm_mul_ps(P0, Position0); + XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(P1, Position1); + vResult = _mm_add_ps(vResult,vTemp); + vTemp = _mm_mul_ps(T1, Tangent1); + vResult = _mm_add_ps(vResult,vTemp); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorHermiteV +( + FXMVECTOR Position0, + FXMVECTOR Tangent0, + FXMVECTOR Position1, + GXMVECTOR Tangent1, + CXMVECTOR T +) +{ + // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 + + // (t^3 - 2 * t^2 + t) * Tangent0 + + // (-2 * t^3 + 3 * t^2) * Position1 + + // (t^3 - t^2) * Tangent1 + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR T2 = XMVectorMultiply(T, T); + XMVECTOR T3 = XMVectorMultiply(T , T2); + + XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f); + XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]); + XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]); + XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(T0, Tangent0, Result); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(T1, Tangent1, Result); + + return 
Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; + static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; + + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Mul by the constants against t^2 + T2 = vmulq_f32(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = vmlaq_f32(T2, T3, CatMulT3 ); + // T3 now has the pre-result. + // I need to add t.y only + T2 = vandq_u32(T,g_XMMaskY); + T3 = vaddq_f32(T3,T2); + // Add 1.0f to x + T3 = vaddq_f32(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = vdupq_lane_f32( vget_low_f32( T3 ), 0 ); // T3[0] + vResult = vmulq_f32(vResult,Position0); + // Mul the y constant to Tangent0 + T2 = vdupq_lane_f32( vget_low_f32( T3 ), 1 ); // T3[1] + vResult = vmlaq_f32(vResult, T2, Tangent0 ); + // Mul the z constant to Position1 + T2 = vdupq_lane_f32( vget_high_f32( T3 ), 0 ); // T3[2] + vResult = vmlaq_f32(vResult, T2, Position1 ); + // Mul the w constant to Tangent1 + T3 = vdupq_lane_f32( vget_high_f32( T3 ), 1 ); // T3[3] + vResult = vmlaq_f32(vResult, T3, Tangent1 ); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f}; + static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f}; + + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Mul by the constants against t^2 + T2 = _mm_mul_ps(T2,CatMulT2); + // Mul by the constants against t^3 + T3 = _mm_mul_ps(T3,CatMulT3); + // T3 now has the pre-result. 
+ T3 = _mm_add_ps(T3,T2); + // I need to add t.y only + T2 = _mm_and_ps(T,g_XMMaskY); + T3 = _mm_add_ps(T3,T2); + // Add 1.0f to x + T3 = _mm_add_ps(T3,g_XMIdentityR0); + // Now, I have the constants created + // Mul the x constant to Position0 + XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0)); + vResult = _mm_mul_ps(vResult,Position0); + // Mul the y constant to Tangent0 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1)); + T2 = _mm_mul_ps(T2,Tangent0); + vResult = _mm_add_ps(vResult,T2); + // Mul the z constant to Position1 + T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2)); + T2 = _mm_mul_ps(T2,Position1); + vResult = _mm_add_ps(vResult,T2); + // Mul the w constant to Tangent1 + T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3)); + T3 = _mm_mul_ps(T3,Tangent1); + vResult = _mm_add_ps(vResult,T3); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorCatmullRom +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + float t +) +{ + // Result = ((-t^3 + 2 * t^2 - t) * Position0 + + // (3 * t^3 - 5 * t^2 + 2) * Position1 + + // (-3 * t^3 + 4 * t^2 + t) * Position2 + + // (t^3 - t^2) * Position3) * 0.5 + +#if defined(_XM_NO_INTRINSICS_) + + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f); + + XMVECTOR Result = XMVectorMultiply(P0, Position0); + Result = XMVectorMultiplyAdd(P1, Position1, Result); + Result = XMVectorMultiplyAdd(P2, Position2, Result); + Result = XMVectorMultiplyAdd(P3, Position3, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = vdupq_n_f32((-t3 
+ 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = vdupq_n_f32((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = vdupq_n_f32((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = vdupq_n_f32((t3 - t2) * 0.5f); + + P1 = vmulq_f32(P1, Position1); + P0 = vmlaq_f32(P1, P0, Position0); + P3 = vmulq_f32(P3, Position3); + P2 = vmlaq_f32(P3, P2, Position2); + P0 = vaddq_f32(P0,P2); + return P0; +#elif defined(_XM_SSE_INTRINSICS_) + float t2 = t * t; + float t3 = t * t2; + + XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f); + XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f); + XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f); + XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f); + + P0 = _mm_mul_ps(P0, Position0); + P1 = _mm_mul_ps(P1, Position1); + P2 = _mm_mul_ps(P2, Position2); + P3 = _mm_mul_ps(P3, Position3); + P0 = _mm_add_ps(P0,P1); + P2 = _mm_add_ps(P2,P3); + P0 = _mm_add_ps(P0,P2); + return P0; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorCatmullRomV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR Position3, + CXMVECTOR T +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fx = T.vector4_f32[0]; + float fy = T.vector4_f32[1]; + float fz = T.vector4_f32[2]; + float fw = T.vector4_f32[3]; + XMVECTOR vResult = { + 0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0]+ + (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0]+ + (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0]+ + (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]), + 0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1]+ + (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1]+ + (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1]+ + (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]), + 0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2]+ + (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2]+ + 
(-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2]+ + (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]), + 0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3]+ + (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3]+ + (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3]+ + (fw*fw*fw-fw*fw)*Position3.vector4_f32[3]) + }; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; + static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; + static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; + static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; + // Cache T^2 and T^3 + XMVECTOR T2 = vmulq_f32(T,T); + XMVECTOR T3 = vmulq_f32(T,T2); + // Perform the Position0 term + XMVECTOR vResult = vaddq_f32(T2,T2); + vResult = vsubq_f32(vResult,T); + vResult = vsubq_f32(vResult,T3); + vResult = vmulq_f32(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = vmulq_f32(T3,Catmul3); + vTemp = vmlsq_f32(vTemp, T2, Catmul5); + vTemp = vaddq_f32(vTemp,Catmul2); + vResult = vmlaq_f32(vResult, vTemp, Position1); + // Perform the Position2 term and add + vTemp = vmulq_f32(T2,Catmul4); + vTemp = vmlsq_f32(vTemp, T3, Catmul3); + vTemp = vaddq_f32(vTemp,T); + vResult = vmlaq_f32(vResult, vTemp, Position2); + // Position3 is the last term + T3 = vsubq_f32(T3,T2); + vResult = vmlaq_f32(vResult, T3, Position3); + // Multiply by 0.5f and exit + vResult = vmulq_f32(vResult,g_XMOneHalf); + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f}; + static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f}; + static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f}; + static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f}; + // Cache T^2 and T^3 + XMVECTOR T2 = _mm_mul_ps(T,T); + XMVECTOR T3 = _mm_mul_ps(T,T2); + // Perform the Position0 term + XMVECTOR vResult = _mm_add_ps(T2,T2); + vResult = _mm_sub_ps(vResult,T); + vResult = _mm_sub_ps(vResult,T3); + vResult = 
_mm_mul_ps(vResult,Position0); + // Perform the Position1 term and add + XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3); + XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,Catmul2); + vTemp = _mm_mul_ps(vTemp,Position1); + vResult = _mm_add_ps(vResult,vTemp); + // Perform the Position2 term and add + vTemp = _mm_mul_ps(T2,Catmul4); + vTemp2 = _mm_mul_ps(T3,Catmul3); + vTemp = _mm_sub_ps(vTemp,vTemp2); + vTemp = _mm_add_ps(vTemp,T); + vTemp = _mm_mul_ps(vTemp,Position2); + vResult = _mm_add_ps(vResult,vTemp); + // Position3 is the last term + T3 = _mm_sub_ps(T3,T2); + T3 = _mm_mul_ps(T3,Position3); + vResult = _mm_add_ps(vResult,T3); + // Multiply by 0.5f and exit + vResult = _mm_mul_ps(vResult,g_XMOneHalf); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorBaryCentric +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + float f, + float g +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR ScaleF = XMVectorReplicate(f); + + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + XMVECTOR ScaleG = XMVectorReplicate(g); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0); + Result = XMVectorMultiplyAdd(P20, ScaleG, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR SF = vdupq_n_f32(f); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + XMVECTOR SG = vdupq_n_f32(g); + R1 = vmlaq_f32( Position0, R1, SF); + return vmlaq_f32( R1, R2, SG ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR SF = _mm_set_ps1(f); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + XMVECTOR SG = 
_mm_set_ps1(g); + R1 = _mm_mul_ps(R1,SF); + R2 = _mm_mul_ps(R2,SG); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVectorBaryCentricV +( + FXMVECTOR Position0, + FXMVECTOR Position1, + FXMVECTOR Position2, + GXMVECTOR F, + CXMVECTOR G +) +{ + // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0) + +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR P10 = XMVectorSubtract(Position1, Position0); + XMVECTOR P20 = XMVectorSubtract(Position2, Position0); + + XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0); + Result = XMVectorMultiplyAdd(P20, G, Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR R1 = vsubq_f32(Position1,Position0); + XMVECTOR R2 = vsubq_f32(Position2,Position0); + R1 = vmlaq_f32( Position0, R1, F ); + return vmlaq_f32( R1, R2, G); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR R1 = _mm_sub_ps(Position1,Position0); + XMVECTOR R2 = _mm_sub_ps(Position2,Position0); + R1 = _mm_mul_ps(R1,F); + R2 = _mm_mul_ps(R2,G); + R1 = _mm_add_ps(R1,Position0); + R1 = _mm_add_ps(R1,R2); + return R1; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * 2D Vector + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ +// Comparison operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline bool XMVector2Equal +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == 
V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2EqualR(V1, V2)); +#endif +} + + +//------------------------------------------------------------------------------ + +inline uint32_t XMVector2EqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && + (V1.vector4_f32[1] == V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && + (V1.vector4_f32[1] != V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2EqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( 
vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XMVector2EqualIntR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && + (V1.vector4_u32[1] == V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && + (V1.vector4_u32[1] != V2.vector4_u32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2NearEqual +( + FXMVECTOR V1, + FXMVECTOR V2, + FXMVECTOR Epsilon +) +{ +#if defined(_XM_NO_INTRINSICS_) + float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); + float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); + return ((dx <= Epsilon.vector4_f32[0]) && + (dy <= Epsilon.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vDelta = vsub_f32(vget_low_u32(V1), 
vget_low_u32(V2)); + __n64 vTemp = vacle_f32( vDelta, vget_low_u32(Epsilon) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + return ( r == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Get the difference + XMVECTOR vDelta = _mm_sub_ps(V1,V2); + // Get the absolute value of the difference + XMVECTOR vTemp = _mm_setzero_ps(); + vTemp = _mm_sub_ps(vTemp,vDelta); + vTemp = _mm_max_ps(vTemp,vDelta); + vTemp = _mm_cmple_ps(vTemp,Epsilon); + // z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2NotEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)!=3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAnyFalse(XMVector2EqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2NotEqualInt +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); + return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0); +#else // _XM_VMX128_INTRINSICS_ + 
return XMComparisonAnyFalse(XMVector2EqualIntR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2Greater +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); +// z and w are don't care + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XMVector2GreaterR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && + (V1.vector4_f32[1] > V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && + (V1.vector4_f32[1] <= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest==3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool 
XMVector2GreaterOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V1, V2)); +#endif +} + +//------------------------------------------------------------------------------ + +inline uint32_t XMVector2GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); + uint64_t r = vget_lane_u64( vTemp, 0 ); + uint32_t CR = 0; + if ( r == 0xFFFFFFFFFFFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + int iTest = _mm_movemask_ps(vTemp)&3; + uint32_t CR = 0; + if (iTest == 3) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < 
V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) ); + return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&3)==3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32( V ); + __n64 B = vget_low_f32( Bounds ); + // Test if less than or equal + __n64 vTemp1 = vcle_f32(VL,B); + // Negate the bounds + __n64 vTemp2 = vneg_f32(B); + // Test if greater or equal (Reversed) + vTemp2 = vcle_f32(vTemp2,VL); + // Blend answers + vTemp1 = vand_u32(vTemp1,vTemp2); + // x and y in bounds? 
+ return ( vget_lane_u64( vTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x and y in bounds? (z and w are don't care) + return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllInBounds(XMVector2InBoundsR(V, Bounds)); +#endif +} + + +//------------------------------------------------------------------------------ + +inline bool XMVector2IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32( V ); + // Test against itself. NaN is always not equal + __n64 vTempNan = vceq_f32( VL, VL ); + // If x or y are NaN, the mask is zero + return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&3) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool XMVector2IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + __n64 vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) ); + // Compare to infinity + vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) ); + // If any are infinity, the signs are true. 
+ return vget_lane_u64( vTemp, 0 ) != 0; +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x or z are infinity, the signs are true. + return ((_mm_movemask_ps(vTemp)&3) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector2Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = + Result.vector4_f32[1] = + Result.vector4_f32[2] = + Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Perform the dot product on x and y + __n64 vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); + vTemp = vpadd_f32( vTemp, vTemp ); + return vcombine_f32( vTemp, vTemp ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x and y + XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); + // vTemp has y splatted + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); + // x+y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector2Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); + XMVECTOR vResult = { + fCross, 
+        fCross,
+        fCross,
+        fCross
+    };
+    return vResult;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 };
+
+    __n64 vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) );
+    vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    return vcombine_f32( vTemp, vTemp );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Swap x and y
+    XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1));
+    // Perform the muls
+    vResult = _mm_mul_ps(vResult,V1);
+    // Splat y
+    XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
+    // Sub the values
+    vResult = _mm_sub_ss(vResult,vTemp);
+    // Splat the cross product
+    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0));
+    return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Squared length of the 2D vector V (dot(V, V)), replicated to all components.
+inline XMVECTOR XMVector2LengthSq
+(
+    FXMVECTOR V
+)
+{
+    return XMVector2Dot(V, V);
+}
+
+//------------------------------------------------------------------------------
+
+// Estimated reciprocal length 1/||V|| of a 2D vector, replicated to all
+// components. Uses the hardware reciprocal-sqrt estimate (fast, low precision).
+inline XMVECTOR XMVector2ReciprocalLengthEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector2LengthSq(V);
+    Result = XMVectorReciprocalSqrtEst(Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32(V);
+    // Dot2
+    __n64 vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    // Reciprocal sqrt (estimate)
+    vTemp = vrsqrte_f32( vTemp );
+    return vcombine_f32( vTemp, vTemp );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = _mm_rsqrt_ss(vLengthSq);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Full-precision reciprocal length 1/||V|| of a 2D vector, replicated to all
+// components. NEON path refines the estimate with two Newton-Raphson steps.
+inline XMVECTOR XMVector2ReciprocalLength
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector2LengthSq(V);
+    Result = XMVectorReciprocalSqrt(Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32(V);
+    // Dot2
+    __n64 vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    // Reciprocal sqrt
+    __n64 S0 = vrsqrte_f32(vTemp);
+    __n64 P0 = vmul_f32( vTemp, S0 );
+    __n64 R0 = vrsqrts_f32( P0, S0 );
+    __n64 S1 = vmul_f32( S0, R0 );
+    __n64 P1 = vmul_f32( vTemp, S1 );
+    __n64 R1 = vrsqrts_f32( P1, S1 );
+    __n64 Result = vmul_f32( S1, R1 );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = _mm_sqrt_ss(vLengthSq);
+    vLengthSq = _mm_div_ss(g_XMOne,vLengthSq);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Estimated length ||V|| of a 2D vector, replicated to all components.
+// NEON path selects 0 when the squared length is exactly zero.
+inline XMVECTOR XMVector2LengthEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector2LengthSq(V);
+    Result = XMVectorSqrtEst(Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32(V);
+    // Dot2
+    __n64 vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    const __n64 zero = vdup_n_u32(0);
+    __n64 VEqualsZero = vceq_f32( vTemp, zero );
+    // Sqrt (estimate)
+    __n64 Result = vrsqrte_f32( vTemp );
+    Result = vmul_f32( vTemp, Result );
+    Result = vbsl_f32( VEqualsZero, zero, Result );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = _mm_sqrt_ss(vLengthSq);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Full-precision length ||V|| of a 2D vector, replicated to all components.
+inline XMVECTOR XMVector2Length
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector2LengthSq(V);
+    Result = XMVectorSqrt(Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32(V);
+    // Dot2
+    __n64 vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    const __n64 zero = vdup_n_u32(0);
+    __n64 VEqualsZero = vceq_f32( vTemp, zero );
+    // Sqrt
+    __n64 S0 = vrsqrte_f32( vTemp );
+    __n64 P0 = vmul_f32( vTemp, S0 );
+    __n64 R0 = vrsqrts_f32( P0, S0 );
+    __n64 S1 = vmul_f32( S0, R0 );
+    __n64 P1 = vmul_f32( vTemp, S1 );
+    __n64 R1 = vrsqrts_f32( P1, S1 );
+    __n64 Result = vmul_f32( S1, R1 );
+    Result = vmul_f32( vTemp, Result );
+    Result = vbsl_f32( VEqualsZero, zero, Result );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    vLengthSq = _mm_sqrt_ps(vLengthSq);
+    return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+// XMVector2NormalizeEst uses a reciprocal estimate and
+// returns QNaN on zero and infinite vectors.
+
+inline XMVECTOR XMVector2NormalizeEst
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result = XMVector2ReciprocalLength(V);
+    Result = XMVectorMultiply(V, Result);
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32(V);
+    // Dot2
+    __n64 vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    // Reciprocal sqrt (estimate)
+    vTemp = vrsqrte_f32( vTemp );
+    // Normalize
+    __n64 Result = vmul_f32( VL, vTemp );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    // vTemp has y splatted
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    // x+y
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = _mm_rsqrt_ss(vLengthSq);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    vLengthSq = _mm_mul_ps(vLengthSq,V);
+    return vLengthSq;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Full-precision normalization of a 2D vector. The NEON and SSE paths return a
+// zero vector for zero-length input and QNaN for infinite-length input; the
+// scalar path leaves a zero-length input as zero (scale factor stays 0).
+inline XMVECTOR XMVector2Normalize
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR vResult = XMVector2Length( V );
+    float fLength = vResult.vector4_f32[0];
+
+    // Prevent divide by zero
+    if (fLength > 0) {
+        fLength = 1.0f/fLength;
+    }
+
+    // Note: all four components are scaled, not just x and y.
+    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
+    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
+    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
+    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32(V);
+    // Dot2
+    __n64 vTemp = vmul_f32( VL, VL );
+    vTemp = vpadd_f32( vTemp, vTemp );
+    __n64 VEqualsZero = vceq_f32( vTemp, vdup_n_u32(0) );
+    __n64 VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) );
+    // Reciprocal sqrt (2 iterations of Newton-Raphson)
+    __n64 S0 = vrsqrte_f32( vTemp );
+    __n64 P0 = vmul_f32( vTemp, S0 );
+    __n64 R0 = vrsqrts_f32( P0, S0 );
+    __n64 S1 = vmul_f32( S0, R0 );
+    __n64 P1 = vmul_f32( vTemp, S1 );
+    __n64 R1 = vrsqrts_f32( P1, S1 );
+    vTemp = vmul_f32( S1, R1 );
+    // Normalize
+    __n64 Result = vmul_f32( VL, vTemp );
+    Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result );
+    Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result );
+    return vcombine_f32( Result, Result );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Perform the dot product on x and y only
+    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
+    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
+    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
+    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
+    // Prepare for the division
+    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
+    // Create zero with a single instruction
+    XMVECTOR vZeroMask = _mm_setzero_ps();
+    // Test for a divide by zero (Must be FP to detect -0.0)
+    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
+    // Failsafe on zero (Or epsilon) length planes
+    // If the length is infinity, set the elements to zero
+    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
+    // Reciprocal mul to perform the normalization
+    vResult = _mm_div_ps(V,vResult);
+    // Any that are infinity, set to zero
+    vResult = _mm_and_ps(vResult,vZeroMask);
+    // Select qnan or result based on infinite length
+    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
+    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
+    vResult = _mm_or_ps(vTemp1,vTemp2);
+    return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Clamps the length of a 2D vector to [LengthMin, LengthMax]; scalar
+// convenience wrapper that replicates the bounds and defers to the V form.
+inline XMVECTOR XMVector2ClampLength
+(
+    FXMVECTOR V,
+    float LengthMin,
+    float LengthMax
+)
+{
+    XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
+    XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
+    return XMVector2ClampLengthV(V, ClampMin, ClampMax);
+}
+
+//------------------------------------------------------------------------------
+
+// Clamps the length of a 2D vector to [LengthMin, LengthMax]. Bounds are given
+// as replicated vectors (asserted: x == y, non-negative, max >= min). If the
+// current length already lies inside the range, V is returned unchanged to
+// avoid precision loss from the renormalization.
+inline XMVECTOR XMVector2ClampLengthV
+(
+    FXMVECTOR V,
+    FXMVECTOR LengthMin,
+    FXMVECTOR LengthMax
+)
+{
+    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
+    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
+    assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
+    assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
+    assert(XMVector2GreaterOrEqual(LengthMax, LengthMin));
+
+    XMVECTOR LengthSq = XMVector2LengthSq(V);
+
+    const XMVECTOR Zero = XMVectorZero();
+
+    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
+
+    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
+    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
+
+    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
+
+    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
+
+    // For zero or infinite length, fall back to LengthSq itself so the
+    // degenerate value propagates instead of NaN from the reciprocal sqrt.
+    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
+    Length = XMVectorSelect(LengthSq, Length, Select);
+    Normal = XMVectorSelect(LengthSq, Normal, Select);
+
+    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
+    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
+
+    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
+    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
+
+    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
+
+    // Preserve the original vector (with no precision loss) if the length falls within the given range
+    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
+    Result = XMVectorSelect(Result, V, Control);
+
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+// Reflects the 2D vector Incident about Normal.
+inline XMVECTOR XMVector2Reflect
+(
+    FXMVECTOR Incident,
+    FXMVECTOR Normal
+)
+{
+    // Result = Incident - (2 * dot(Incident, Normal)) * Normal
+
+    XMVECTOR Result;
+    Result = XMVector2Dot(Incident, Normal);
+    Result = XMVectorAdd(Result, Result);
+    Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
+    return Result;
+}
+
+//------------------------------------------------------------------------------
+
+// Refraction with a scalar index; wrapper that replicates the index and
+// defers to the V form.
+inline XMVECTOR XMVector2Refract
+(
+    FXMVECTOR Incident,
+    FXMVECTOR Normal,
+    float RefractionIndex
+)
+{
+    XMVECTOR Index = XMVectorReplicate(RefractionIndex);
+    return XMVector2RefractV(Incident, Normal, Index);
+}
+
+//------------------------------------------------------------------------------
+
+// Return the refraction of a 2D vector
+// Total internal reflection (discriminant <= 0) yields zero components.
+inline XMVECTOR XMVector2RefractV
+(
+    FXMVECTOR Incident,
+    FXMVECTOR Normal,
+    FXMVECTOR RefractionIndex
+)
+{
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+
+#if defined(_XM_NO_INTRINSICS_)
+
+    float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]);
+    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    float RY = 1.0f-(IDotN*IDotN);
+    float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]);
+    RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]);
+    if (RX>=0.0f) {
+        RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX)));
+    } else {
+        RX = 0.0f;
+    }
+    if (RY>=0.0f) {
+        RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY)));
+    } else {
+        RY = 0.0f;
+    }
+
+    XMVECTOR vResult;
+    vResult.vector4_f32[0] = RX;
+    vResult.vector4_f32[1] = RY;
+    vResult.vector4_f32[2] = 0.0f;
+    vResult.vector4_f32[3] = 0.0f;
+    return vResult;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 IL = vget_low_f32( Incident );
+    __n64 NL = vget_low_f32( Normal );
+    __n64 RIL = vget_low_f32( RefractionIndex );
+    // Get the 2D Dot product of Incident-Normal
+    __n64 vTemp = vmul_f32(IL, NL);
+    __n64 IDotN = vpadd_f32( vTemp, vTemp );
+    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN);
+    vTemp = vmul_f32(vTemp,RIL);
+    vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL );
+    // If any terms are <=0, sqrt() will fail, punt to zero
+    __n64 vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) );
+    // Sqrt(vTemp)
+    __n64 S0 = vrsqrte_f32(vTemp);
+    __n64 P0 = vmul_f32( vTemp, S0 );
+    __n64 R0 = vrsqrts_f32( P0, S0 );
+    __n64 S1 = vmul_f32( S0, R0 );
+    __n64 P1 = vmul_f32( vTemp, S1 );
+    __n64 R1 = vrsqrts_f32( P1, S1 );
+    __n64 S2 = vmul_f32( S1, R1 );
+    vTemp = vmul_f32( vTemp, S2 );
+    // R = RefractionIndex * IDotN + sqrt(R)
+    vTemp = vmla_f32( vTemp, RIL, IDotN );
+    // Result = RefractionIndex * Incident - Normal * R
+    __n64 vResult = vmul_f32(RIL,IL);
+    vResult = vmls_f32( vResult, vTemp, NL );
+    vResult = vand_u32(vResult,vMask);
+    return vcombine_f32(vResult, vResult);
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
+    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
+    // Get the 2D Dot product of Incident-Normal
+    XMVECTOR IDotN = XMVector2Dot(Incident, Normal);
+    // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
+    XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN);
+    vTemp = _mm_sub_ps(g_XMOne,vTemp);
+    vTemp = _mm_mul_ps(vTemp,RefractionIndex);
+    vTemp = _mm_mul_ps(vTemp,RefractionIndex);
+    vTemp = _mm_sub_ps(g_XMOne,vTemp);
+    // If any terms are <=0, sqrt() will fail, punt to zero
+    XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero);
+    // R = RefractionIndex * IDotN + sqrt(R)
+    vTemp = _mm_sqrt_ps(vTemp);
+    XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN);
+    vTemp = _mm_add_ps(vTemp,vResult);
+    // Result = RefractionIndex * Incident - Normal * R
+    vResult = _mm_mul_ps(RefractionIndex,Incident);
+    vTemp = _mm_mul_ps(vTemp,Normal);
+    vResult = _mm_sub_ps(vResult,vTemp);
+    vResult = _mm_and_ps(vResult,vMask);
+    return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Returns a vector perpendicular to the 2D vector V: (-y, x).
+// NOTE(review): the scalar and NEON paths zero z/w, but the SSE path carries
+// V.z/V.w through the permute — callers should rely only on x/y.
+inline XMVECTOR XMVector2Orthogonal
+(
+    FXMVECTOR V
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Result;
+    Result.vector4_f32[0] = -V.vector4_f32[1];
+    Result.vector4_f32[1] = V.vector4_f32[0];
+    Result.vector4_f32[2] = 0.f;
+    Result.vector4_f32[3] = 0.f;
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 };
+    const __n64 zero = vdup_n_f32(0);
+
+    __n64 VL = vget_low_f32( V );
+    __n64 Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) );
+    return vcombine_f32( Result, zero );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
+    vResult = _mm_mul_ps(vResult,g_XMNegateX);
+    return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Estimated angle (radians) between two already-normalized 2D vectors.
+inline XMVECTOR XMVector2AngleBetweenNormalsEst
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR Result = XMVector2Dot(N1, N2);
+    // Clamp guards acos against dot products slightly outside [-1, 1].
+    Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
+    Result = XMVectorACosEst(Result);
+    return Result;
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Angle (radians) between two already-normalized 2D vectors.
+inline XMVECTOR XMVector2AngleBetweenNormals
+(
+    FXMVECTOR N1,
+    FXMVECTOR N2
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR Result = XMVector2Dot(N1, N2);
+    Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne);
+    Result = XMVectorACos(Result);
+    return Result;
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Angle (radians) between two arbitrary (non-normalized) 2D vectors.
+inline XMVECTOR XMVector2AngleBetweenVectors
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR L1 = XMVector2ReciprocalLength(V1);
+    XMVECTOR L2 = XMVector2ReciprocalLength(V2);
+
+    XMVECTOR Dot = XMVector2Dot(V1, V2);
+
+    L1 = XMVectorMultiply(L1, L2);
+
+    XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
+    CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
+
+    return XMVectorACos(CosAngle);
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Minimum distance from Point to the infinite line through LinePoint1 and
+// LinePoint2, replicated to all components.
+inline XMVECTOR XMVector2LinePointDistance
+(
+    FXMVECTOR LinePoint1,
+    FXMVECTOR LinePoint2,
+    FXMVECTOR Point
+)
+{
+    // Given a vector PointVector from LinePoint1 to Point and a vector
+    // LineVector from LinePoint1 to LinePoint2, the scaled distance
+    // PointProjectionScale from LinePoint1 to the perpendicular projection
+    // of PointVector onto the line is defined as:
+    //
+    //     PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
+    XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
+
+    XMVECTOR LengthSq = XMVector2LengthSq(LineVector);
+
+    XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector);
+    PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
+
+    XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
+    DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
+
+    return XMVector2Length(DistanceVector);
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Intersection point of the two infinite 2D lines (Line1Point1, Line1Point2)
+// and (Line2Point1, Line2Point2). Returns +Infinity components for coincident
+// lines and QNaN components for parallel (non-intersecting) lines.
+inline XMVECTOR XMVector2IntersectLine
+(
+    FXMVECTOR Line1Point1,
+    FXMVECTOR Line1Point2,
+    FXMVECTOR Line2Point1,
+    GXMVECTOR Line2Point2
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1);
+    XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1);
+    XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1);
+
+    XMVECTOR C1 = XMVector2Cross(V1, V2);
+    XMVECTOR C2 = XMVector2Cross(V2, V3);
+
+    XMVECTOR Result;
+    const XMVECTOR Zero = XMVectorZero();
+    if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v))
+    {
+        if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v))
+        {
+            // Coincident
+            Result = g_XMInfinity.v;
+        }
+        else
+        {
+            // Parallel
+            Result = g_XMQNaN.v;
+        }
+    }
+    else
+    {
+        // Intersection point = Line1Point1 + V1 * (C2 / C1)
+        XMVECTOR Scale = XMVectorReciprocal(C1);
+        Scale = XMVectorMultiply(C2, Scale);
+        Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1);
+    }
+
+    return Result;
+
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1);
+    XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1);
+    XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1);
+    // Generate the cross products
+    XMVECTOR C1 = XMVector2Cross(V1, V2);
+    XMVECTOR C2 = XMVector2Cross(V2, V3);
+    // If C1 is not close to epsilon, use the calculated value
+    XMVECTOR vResultMask = _mm_setzero_ps();
+    vResultMask = _mm_sub_ps(vResultMask,C1);
+    vResultMask = _mm_max_ps(vResultMask,C1);
+    // 0xFFFFFFFF if the calculated value is to be used
+    vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon);
+    // If C1 is close to epsilon, which fail type is it? INFINITY or NAN?
+    XMVECTOR vFailMask = _mm_setzero_ps();
+    vFailMask = _mm_sub_ps(vFailMask,C2);
+    vFailMask = _mm_max_ps(vFailMask,C2);
+    vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon);
+    XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity);
+    vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN);
+    // vFail is NAN or INF
+    vFail = _mm_or_ps(vFail,vFailMask);
+    // Intersection point = Line1Point1 + V1 * (C2 / C1)
+    XMVECTOR vResult = _mm_div_ps(C2,C1);
+    vResult = _mm_mul_ps(vResult,V1);
+    vResult = _mm_add_ps(vResult,Line1Point1);
+    // Use result, or failure value
+    vResult = _mm_and_ps(vResult,vResultMask);
+    vResultMask = _mm_andnot_ps(vResultMask,vFail);
+    vResult = _mm_or_ps(vResult,vResultMask);
+    return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Transforms the 2D vector V by matrix M, treating V as (x, y, 0, 1):
+// Result = x*M.r[0] + y*M.r[1] + M.r[3]. No perspective divide is applied.
+inline XMVECTOR XMVector2Transform
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32( V );
+    __n128 Y = vdupq_lane_f32( VL, 1 );
+    __n128 Result = vmlaq_f32( M.r[3], Y, M.r[1] );
+    __n128 X = vdupq_lane_f32( VL, 0 );
+    return vmlaq_f32( Result, X, M.r[0] );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+    vResult = _mm_mul_ps(vResult,M.r[0]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    vTemp = _mm_mul_ps(vTemp,M.r[1]);
+    vResult = _mm_add_ps(vResult,vTemp);
+    vResult = _mm_add_ps(vResult,M.r[3]);
+    return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Transforms a stream of XMFLOAT2 by M (as (x, y, 0, 1)), writing full
+// XMFLOAT4 results. Strides are in bytes; returns pOutputStream.
+_Use_decl_annotations_
+inline XMFLOAT4* XMVector2TransformStream
+(
+    XMFLOAT4* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    CXMMATRIX M
+)
+{
+    assert(pOutputStream != NULL);
+    assert(pInputStream != NULL);
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    // Hoist the matrix rows out of the loop.
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+
+//------------------------------------------------------------------------------
+
+// Transforms the 2D point V by M and divides by the resulting w component
+// (projective transform of a point).
+inline XMVECTOR XMVector2TransformCoord
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    XMVECTOR W = XMVectorSplatW(Result);
+    return XMVectorDivide( Result, W );
+
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Stream version of XMVector2TransformCoord: transforms XMFLOAT2 points by M
+// with a w-divide, writing XMFLOAT2 results. Strides are in bytes.
+_Use_decl_annotations_
+inline XMFLOAT2* XMVector2TransformCoordStream
+(
+    XMFLOAT2* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    CXMMATRIX M
+)
+{
+    assert(pOutputStream != NULL);
+    assert(pInputStream != NULL);
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+    const XMVECTOR row3 = M.r[3];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        XMVECTOR W = XMVectorSplatW(Result);
+
+        Result = XMVectorDivide(Result, W);
+
+        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Transforms the 2D normal V by M, treating V as (x, y, 0, 0): the translation
+// row M.r[3] is intentionally not applied.
+inline XMVECTOR XMVector2TransformNormal
+(
+    FXMVECTOR V,
+    CXMMATRIX M
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+
+    XMVECTOR Y = XMVectorSplatY(V);
+    XMVECTOR X = XMVectorSplatX(V);
+
+    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
+    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
+
+    return Result;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n64 VL = vget_low_f32( V );
+    __n128 Y = vdupq_lane_f32( VL, 1 );
+    __n128 Result = vmulq_f32( Y, M.r[1] );
+    __n128 X = vdupq_lane_f32( VL, 0 );
+    return vmlaq_f32( Result, X, M.r[0] );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
+    vResult = _mm_mul_ps(vResult,M.r[0]);
+    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
+    vTemp = _mm_mul_ps(vTemp,M.r[1]);
+    vResult = _mm_add_ps(vResult,vTemp);
+    return vResult;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// Stream version of XMVector2TransformNormal: transforms XMFLOAT2 normals by
+// M (no translation), writing XMFLOAT2 results. Strides are in bytes.
+_Use_decl_annotations_
+inline XMFLOAT2* XMVector2TransformNormalStream
+(
+    XMFLOAT2* pOutputStream,
+    size_t OutputStride,
+    const XMFLOAT2* pInputStream,
+    size_t InputStride,
+    size_t VectorCount,
+    CXMMATRIX M
+)
+{
+    assert(pOutputStream != NULL);
+    assert(pInputStream != NULL);
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+
+    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
+    uint8_t* pOutputVector = (uint8_t*)pOutputStream;
+
+    const XMVECTOR row0 = M.r[0];
+    const XMVECTOR row1 = M.r[1];
+
+    for (size_t i = 0; i < VectorCount; i++)
+    {
+        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
+        XMVECTOR Y = XMVectorSplatY(V);
+        XMVECTOR X = XMVectorSplatX(V);
+
+        XMVECTOR Result = XMVectorMultiply(Y, row1);
+        Result = XMVectorMultiplyAdd(X, row0, Result);
+
+        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);
+
+        pInputVector += InputStride;
+        pOutputVector += OutputStride;
+    }
+
+    return pOutputStream;
+
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+/****************************************************************************
+ *
+ * 3D Vector
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+// Comparison operations
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+
+// True if the x, y, and z components of V1 and V2 are all equal (w ignored).
+inline bool XMVector3Equal
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vceqq_f32( V1, V2 );
+    // Interleave the comparison mask bytes so one byte per lane ends up in a
+    // single 32-bit field; 0xFFFFFF means lanes x, y, z all matched.
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+    // Mask with 7 keeps only the x, y, z sign bits.
+    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+    return XMComparisonAllTrue(XMVector3EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Record-form equality on x/y/z: returns a CR6-style mask that is
+// XM_CRMASK_CR6TRUE when all three match, XM_CRMASK_CR6FALSE when none match,
+// and 0 for a mixed result.
+inline uint32_t XMVector3EqualR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] == V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] != V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vceqq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;
+
+    uint32_t CR = 0;
+    if ( r == 0xFFFFFFU )
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ( !r )
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+    int iTest = _mm_movemask_ps(vTemp)&7;
+    uint32_t CR = 0;
+    if (iTest==7)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// True if the x, y, and z components of V1 and V2 are bitwise equal as
+// 32-bit integers (w ignored).
+inline bool XMVector3EqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+    return XMComparisonAllTrue(XMVector3EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Record-form bitwise equality on x/y/z (see XMVector3EqualR for the
+// CR6 mask convention).
+inline uint32_t XMVector3EqualIntR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
+        (V1.vector4_u32[1] == V2.vector4_u32[1]) &&
+        (V1.vector4_u32[2] == V2.vector4_u32[2]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
+        (V1.vector4_u32[1] != V2.vector4_u32[1]) &&
+        (V1.vector4_u32[2] != V2.vector4_u32[2]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;
+
+    uint32_t CR = 0;
+    if ( r == 0xFFFFFFU )
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ( !r )
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7;
+    uint32_t CR = 0;
+    if (iTemp==7)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTemp)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// True if |V1 - V2| <= Epsilon component-wise for x, y, and z (w ignored).
+inline bool XMVector3NearEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2,
+    FXMVECTOR Epsilon
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    float dx, dy, dz;
+
+    dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
+    dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
+    dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
+    return (((dx <= Epsilon.vector4_f32[0]) &&
+        (dy <= Epsilon.vector4_f32[1]) &&
+        (dz <= Epsilon.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vDelta = vsubq_f32( V1, V2 );
+    // vacleq compares absolute values: |delta| <= |Epsilon| per lane.
+    __n128 vResult = vacleq_f32( vDelta, Epsilon );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    // Get the difference
+    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
+    // Get the absolute value of the difference
+    XMVECTOR vTemp = _mm_setzero_ps();
+    vTemp = _mm_sub_ps(vTemp,vDelta);
+    vTemp = _mm_max_ps(vTemp,vDelta);
+    vTemp = _mm_cmple_ps(vTemp,Epsilon);
+    // w is don't care
+    return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// True if any of the x, y, or z components of V1 and V2 differ (w ignored).
+inline bool XMVector3NotEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vceqq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
+    return (((_mm_movemask_ps(vTemp)&7)!=7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+    return XMComparisonAnyFalse(XMVector3EqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// True if any of the x, y, or z components of V1 and V2 differ bitwise as
+// 32-bit integers (w ignored).
+inline bool XMVector3NotEqualInt
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vceqq_u32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
+    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+    return XMComparisonAnyFalse(XMVector3EqualIntR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// True if V1 > V2 component-wise for x, y, and z (w ignored).
+inline bool XMVector3Greater
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vcgtq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+    return XMComparisonAllTrue(XMVector3GreaterR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+// Record-form greater-than on x/y/z: XM_CRMASK_CR6TRUE when all three are
+// greater, XM_CRMASK_CR6FALSE when none are, 0 for a mixed result.
+inline uint32_t XMVector3GreaterR
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    uint32_t CR = 0;
+    if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] > V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] > V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
+        (V1.vector4_f32[1] <= V2.vector4_f32[1]) &&
+        (V1.vector4_f32[2] <= V2.vector4_f32[2]))
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vcgtq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;
+
+    uint32_t CR = 0;
+    if ( r == 0xFFFFFFU )
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if ( !r )
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
+    uint32_t CR = 0;
+    int iTest = _mm_movemask_ps(vTemp)&7;
+    if (iTest==7)
+    {
+        CR = XM_CRMASK_CR6TRUE;
+    }
+    else if (!iTest)
+    {
+        CR = XM_CRMASK_CR6FALSE;
+    }
+    return CR;
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+
+// True if V1 >= V2 component-wise for x, y, and z (w ignored).
+inline bool XMVector3GreaterOrEqual
+(
+    FXMVECTOR V1,
+    FXMVECTOR V2
+)
+{
+#if defined(_XM_NO_INTRINSICS_)
+    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0);
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vcgeq_f32( V1, V2 );
+    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
+    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
+    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
+#elif defined(_XM_SSE_INTRINSICS_)
+    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
+    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
+#else // _XM_VMX128_INTRINSICS_
+    return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V1, V2));
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline uint32_t
XMVector3GreaterOrEqualR +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + + uint32_t CR = 0; + if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && + (V1.vector4_f32[1] >= V2.vector4_f32[1]) && + (V1.vector4_f32[2] >= V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && + (V1.vector4_f32[1] < V2.vector4_f32[1]) && + (V1.vector4_f32[2] < V2.vector4_f32[2])) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vcgeq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; + + uint32_t CR = 0; + if ( r == 0xFFFFFFU ) + { + CR = XM_CRMASK_CR6TRUE; + } + else if ( !r ) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); + uint32_t CR = 0; + int iTest = _mm_movemask_ps(vTemp)&7; + if (iTest==7) + { + CR = XM_CRMASK_CR6TRUE; + } + else if (!iTest) + { + CR = XM_CRMASK_CR6FALSE; + } + return CR; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool XMVector3Less +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vcltq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return 
XMComparisonAllTrue(XMVector3GreaterR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMVector3LessOrEqual +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vcleq_f32( V1, V2 ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR vTemp = _mm_cmple_ps(V1,V2); + return (((_mm_movemask_ps(vTemp)&7)==7) != 0); +#else // _XM_VMX128_INTRINSICS_ + return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V2, V1)); +#endif +} + +//------------------------------------------------------------------------------ + +inline bool XMVector3InBounds +( + FXMVECTOR V, + FXMVECTOR Bounds +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && + (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && + (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test if less than or equal + __n128 vTemp1 = vcleq_f32(V,Bounds); + // Negate the bounds + __n128 vTemp2 = vnegq_f32(Bounds); + // Test if greater or equal (Reversed) + vTemp2 = vcleq_f32(vTemp2,V); + // Blend answers + vTemp1 = vandq_u32(vTemp1,vTemp2); + // in bounds? 
+ int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test if less than or equal + XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); + // Negate the bounds + XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); + // Test if greater or equal (Reversed) + vTemp2 = _mm_cmple_ps(vTemp2,V); + // Blend answers + vTemp1 = _mm_and_ps(vTemp1,vTemp2); + // x,y and z in bounds? (w is don't care) + return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); +#else + return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); +#endif +} + + +//------------------------------------------------------------------------------ + +inline bool XMVector3IsNaN +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + return (XMISNAN(V.vector4_f32[0]) || + XMISNAN(V.vector4_f32[1]) || + XMISNAN(V.vector4_f32[2])); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Test against itself. NaN is always not equal + __n128 vTempNan = vceqq_f32( V, V ); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + // If x or y or z are NaN, the mask is zero + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); +#elif defined(_XM_SSE_INTRINSICS_) + // Test against itself. 
NaN is always not equal + XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); + // If x or y or z are NaN, the mask is non-zero + return ((_mm_movemask_ps(vTempNan)&7) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline bool XMVector3IsInfinite +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + return (XMISINF(V.vector4_f32[0]) || + XMISINF(V.vector4_f32[1]) || + XMISINF(V.vector4_f32[2])); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Mask off the sign bit + __n128 vTempInf = vandq_u32( V, g_XMAbsMask ); + // Compare to infinity + vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); + // If any are infinity, the signs are true. + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Mask off the sign bit + __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); + // Compare to infinity + vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); + // If x,y or z are infinity, the signs are true. 
+ return ((_mm_movemask_ps(vTemp)&7) != 0); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// Computation operations +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3Dot +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; + XMVECTOR vResult = { + fValue, + fValue, + fValue, + fValue + }; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vTemp = vmulq_f32( V1, V2 ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + return vcombine_f32( v1, v1 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V1,V2); + // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.vector4_f32[0] = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.vector4_f32[2] + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.vector4_f32[0] = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3Cross +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR vResult = { + (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), + (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * 
V2.vector4_f32[2]), + (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), + 0.0f + }; + return vResult; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 v1xy = vget_low_f32(V1); + __n64 v2xy = vget_low_f32(V2); + + __n64 v1yx = vrev64_f32( v1xy ); + __n64 v2yx = vrev64_f32( v2xy ); + + __n64 v1zz = vdup_lane_f32( vget_high_f32(V1), 0 ); + __n64 v2zz = vdup_lane_f32( vget_high_f32(V2), 0 ); + + __n128 vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) ); + vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) ); + return veorq_u32( vResult, g_XMFlipY ); +#elif defined(_XM_SSE_INTRINSICS_) + // y1,z1,x1,w1 + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1)); + // z2,x2,y2,w2 + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2)); + // Perform the left operation + XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2); + // z1,x1,y1,w1 + vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1)); + // y2,z2,x2,w2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2)); + // Perform the right operation + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + // Subract the right from left, and return answer + vResult = _mm_sub_ps(vResult,vTemp1); + // Set w to zero + return _mm_and_ps(vResult,g_XMMask3); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3LengthSq +( + FXMVECTOR V +) +{ + return XMVector3Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = 
vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt + __n64 S0 = vrsqrte_f32(v1); + __n64 P0 = vmul_f32( v1, S0 ); + __n64 R0 = vrsqrts_f32( P0, S0 ); + __n64 S1 = vmul_f32( S0, R0 ); + __n64 P1 = vmul_f32( v1, S1 ); + __n64 R1 = vrsqrts_f32( P1, S1 ); + __n64 Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = 
_mm_add_ss(vDot,vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_sqrt_ps(vDot); + // Get the reciprocal + vDot = _mm_div_ps(g_XMOne,vDot); + return vDot; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + const __n64 zero = vdup_n_u32(0); + __n64 VEqualsZero = vceq_f32( v1, zero ); + // Sqrt (estimate) + __n64 Result = vrsqrte_f32( v1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector3LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + const __n64 zero = vdup_n_u32(0); + __n64 VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + __n64 S0 = vrsqrte_f32( v1 ); + __n64 P0 = vmul_f32( v1, S0 ); + __n64 R0 = vrsqrts_f32( P0, S0 ); + __n64 S1 = vmul_f32( S0, R0 ); + __n64 P1 = vmul_f32( v1, S1 ); + __n64 R1 = vrsqrts_f32( P1, S1 ); + __n64 Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and y + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); + // x+z, y + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // y,y,y,y + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // x+z+y,??,??,?? 
+ vLengthSq = _mm_add_ss(vLengthSq,vTemp); + // Splat the length squared + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Get the length + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// XMVector3NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XMVector3NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector3ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + // Normalize + return vmulq_f32( V, vcombine_f32(v2,v2) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product + XMVECTOR vDot = _mm_mul_ps(V,V); + // x=Dot.y, y=Dot.z + XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); + // Result.x = x+y + vDot = _mm_add_ss(vDot,vTemp); + // x=Dot.z + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + // Result.x = (x+y)+z + vDot = _mm_add_ss(vDot,vTemp); + // Splat x + vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); + // Get the reciprocal + vDot = _mm_rsqrt_ps(vDot); + // Perform the normalization + vDot = _mm_mul_ps(vDot,V); + return vDot; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector3Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if 
(fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot3 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vdup_lane_f32( v2, 0 ); + v1 = vadd_f32( v1, v2 ); + __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) ); + __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + __n64 S0 = vrsqrte_f32( v1 ); + __n64 P0 = vmul_f32( v1, S0 ); + __n64 R0 = vrsqrts_f32( P0, S0 ); + __n64 S1 = vmul_f32( S0, R0 ); + __n64 P1 = vmul_f32( v1, S1 ); + __n64 R1 = vrsqrts_f32( P1, S1 ); + v2 = vmul_f32( S1, R1 ); + // Normalize + __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); + vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); + return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y and z only + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); + vLengthSq = _mm_add_ss(vLengthSq,vTemp); + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to 
perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector3ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector3LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = 
XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector3Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector3RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + +#if defined(_XM_NO_INTRINSICS_) + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = 
XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector3Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex ); + + __n128 vResult = vcleq_f32(R,g_XMZero); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); + if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + __n128 S0 = vrsqrteq_f32(R); + __n128 P0 = vmulq_f32( R, S0 ); + __n128 R0 = vrsqrtsq_f32( P0, S0 ); + __n128 S1 = vmulq_f32( S0, R0 ); + __n128 P1 = vmulq_f32( R, S1 ); + __n128 R1 = vrsqrtsq_f32( P1, S1 ); + __n128 S2 = vmulq_f32( S1, R1 ); + R = vmulq_f32( R, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32( R, RefractionIndex, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + vResult = vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32( vResult, R, Normal ); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + XMVECTOR IDotN = XMVector3Dot(Incident, Normal); + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + 
XMVECTOR R = _mm_mul_ps(IDotN, IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex,IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Zero = XMVectorZero(); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V); + + XMVECTOR NegativeV = XMVectorSubtract(Zero, V); + + XMVECTOR ZIsNegative = XMVectorLess(Z, Zero); + XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero); + + XMVECTOR S = XMVectorAdd(YZYY, Z); + XMVECTOR D = XMVectorSubtract(YZYY, Z); + + XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); + + XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S); + XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D); + + return XMVectorSelect(R1, R0, Select); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || 
defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector3Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR L1 = XMVector3ReciprocalLength(V1); + XMVECTOR L2 = XMVector3ReciprocalLength(V2); + + XMVECTOR Dot = XMVector3Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector3LinePointDistance +( + FXMVECTOR LinePoint1, + FXMVECTOR LinePoint2, + FXMVECTOR Point +) +{ + // Given a vector PointVector from LinePoint1 to Point and a vector + // LineVector from LinePoint1 to LinePoint2, the scaled distance + // PointProjectionScale from LinePoint1 to the perpendicular projection + // of PointVector onto the line is defined as: + // + // PointProjectionScale = dot(PointVector, 
// LineVector) / LengthSq(LineVector)

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Project PointVector onto LineVector, then measure the perpendicular
    // remainder: distance = |PointVector - (dot(P,L)/|L|^2) * L|.
    XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
    XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);

    XMVECTOR LengthSq = XMVector3LengthSq(LineVector);

    XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector);
    PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);

    XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
    DistanceVector = XMVectorSubtract(PointVector, DistanceVector);

    return XMVector3Length(DistanceVector);

    // NOTE(review): the VMX128 branch is empty in this copy; a build selecting
    // _XM_VMX128_INTRINSICS_ would fall off the end of a value-returning
    // function. Confirm that configuration is never used on this target.
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Split V into the component parallel to Normal and the component
// perpendicular to it. Writes both results through the output pointers;
// Normal is not required to be unit length here (Scale is dot(V, Normal),
// so a non-unit Normal scales the parallel part accordingly).

_Use_decl_annotations_
inline void XMVector3ComponentsFromNormal
(
    XMVECTOR* pParallel,
    XMVECTOR* pPerpendicular,
    FXMVECTOR V,
    FXMVECTOR Normal
)
{
    assert(pParallel != NULL);
    assert(pPerpendicular != NULL);

    XMVECTOR Scale = XMVector3Dot(V, Normal);

    XMVECTOR Parallel = XMVectorMultiply(Normal, Scale);

    *pParallel = Parallel;
    // Perpendicular component is whatever remains after removing the
    // parallel projection.
    *pPerpendicular = XMVectorSubtract(V, Parallel);
}

//------------------------------------------------------------------------------
// Transform a vector using a rotation expressed as a unit quaternion

inline XMVECTOR XMVector3Rotate
(
    FXMVECTOR V,
    FXMVECTOR RotationQuaternion
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Computes q^-1 * v * q (with v promoted to a pure quaternion whose
    // w is taken from g_XMSelect1110 masking).
    XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
    XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
    XMVECTOR Result = XMQuaternionMultiply(Q, A);
    return XMQuaternionMultiply(Result, RotationQuaternion);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Transform a vector using the inverse of a rotation expressed as a unit quaternion

inline XMVECTOR XMVector3InverseRotate
(
    FXMVECTOR V,
    FXMVECTOR RotationQuaternion
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Computes q * v * q^-1 — the reverse ordering of XMVector3Rotate.
    XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
    XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A);
    XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
    return XMQuaternionMultiply(Result, Q);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Transform a 3D vector by matrix M, treating V's w as 1 (point transform,
// no perspective divide). Returns the full 4D result including w.

inline XMVECTOR XMVector3Transform
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    // Row-vector convention: result = x*r0 + y*r1 + z*r2 + r3.
    XMVECTOR Z = XMVectorSplatZ(V);
    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);

    XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32( V );
    XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X
    XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y
    vResult = vmlaq_f32( M.r[3], vResult, M.r[0] );
    vResult = vmlaq_f32( vResult, vTemp, M.r[1] );
    vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z
    return vmlaq_f32( vResult, vTemp, M.r[2] );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,M.r[0]);
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    vTemp = _mm_mul_ps(vTemp,M.r[1]);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    vTemp = _mm_mul_ps(vTemp,M.r[2]);
    vResult = _mm_add_ps(vResult,vTemp);
    vResult = _mm_add_ps(vResult,M.r[3]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Transform a stream of XMFLOAT3 points by M (w treated as 1) into a stream
// of XMFLOAT4 results. Strides are in BYTES (input/output pointers are
// advanced as uint8_t*), so interleaved vertex layouts are supported.
// Returns pOutputStream for chaining.

_Use_decl_annotations_
inline XMFLOAT4* XMVector3TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t OutputStride,
    const XMFLOAT3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    // Hoist the matrix rows out of the loop.
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // result = x*r0 + y*r1 + z*r2 + r3 (row-vector convention).
        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

    // NOTE(review): the remaining configuration branches are empty in this
    // copy; confirm this target never builds with them selected.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}


//------------------------------------------------------------------------------
// Transform a 3D point by M and perform the perspective divide: returns the
// homogeneous result divided by its own w (so w of the return value is 1).

inline XMVECTOR XMVector3TransformCoord
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR Z = XMVectorSplatZ(V);
    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);

    XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    // Perspective divide by the transformed w.
    XMVECTOR W = XMVectorSplatW(Result);
    return XMVectorDivide( Result, W );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stream version of XMVector3TransformCoord: transforms VectorCount XMFLOAT3
// points and stores the perspective-divided XYZ. Strides are in bytes.
// Returns pOutputStream.

_Use_decl_annotations_
inline XMFLOAT3* XMVector3TransformCoordStream
(
    XMFLOAT3* pOutputStream,
    size_t OutputStride,
    const XMFLOAT3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        // Divide through by w before storing the 3-component result.
        XMVECTOR W = XMVectorSplatW(Result);

        Result = XMVectorDivide(Result, W);

        XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Transform a 3D direction by M, treating V's w as 0: only the upper 3x3
// rows participate, so translation (row 3) is ignored. The result is NOT
// renormalized.

inline XMVECTOR XMVector3TransformNormal
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Z = XMVectorSplatZ(V);
    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);

    // result = x*r0 + y*r1 + z*r2 — no r3 term, unlike XMVector3Transform.
    XMVECTOR Result = XMVectorMultiply(Z, M.r[2]);
    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32( V );
    XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X
    XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y
    vResult = vmulq_f32( vResult, M.r[0] );
    vResult = vmlaq_f32( vResult, vTemp, M.r[1] );
    vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z
    return vmlaq_f32( vResult, vTemp, M.r[2] );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,M.r[0]);
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    vTemp = _mm_mul_ps(vTemp,M.r[1]);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    vTemp = _mm_mul_ps(vTemp,M.r[2]);
    vResult = _mm_add_ps(vResult,vTemp);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stream version of XMVector3TransformNormal: transforms VectorCount
// directions by the upper 3x3 of M (translation ignored). Strides in bytes.
// Returns pOutputStream.

_Use_decl_annotations_
inline XMFLOAT3* XMVector3TransformNormalStream
(
    XMFLOAT3* pOutputStream,
    size_t OutputStride,
    const XMFLOAT3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    // Only rows 0-2 are needed: normals are unaffected by translation.
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        XMVECTOR Result = XMVectorMultiply(Z, row2);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Project a 3D point from world space into viewport (screen) space:
// world -> view -> projection, perspective divide, then map x/y into the
// viewport rectangle and z into [ViewportMinZ, ViewportMaxZ].

inline XMVECTOR XMVector3Project
(
    FXMVECTOR V,
    float ViewportX,
    float ViewportY,
    float ViewportWidth,
    float ViewportHeight,
    float ViewportMinZ,
    float ViewportMaxZ,
    CXMMATRIX Projection,
    CXMMATRIX View,
    CXMMATRIX World
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    const float HalfViewportWidth = ViewportWidth * 0.5f;
    const float HalfViewportHeight = ViewportHeight * 0.5f;

    // Y scale is negated: clip-space +Y is up, screen-space +Y is down.
    // NOTE(review): Scale.w is 0.0f here but 1.0f in XMVector3ProjectStream;
    // only the (normally unused) w of the result differs — confirm against
    // upstream DirectXMath if w is ever consumed.
    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f);
    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);

    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);

    // TransformCoord performs the perspective divide.
    XMVECTOR Result = XMVector3TransformCoord(V, Transform);

    Result = XMVectorMultiplyAdd(Result, Scale, Offset);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stream version of XMVector3Project: projects VectorCount points into
// viewport space. Strides are in bytes. Returns pOutputStream.

_Use_decl_annotations_
inline XMFLOAT3* XMVector3ProjectStream
(
    XMFLOAT3* pOutputStream,
    size_t OutputStride,
    const XMFLOAT3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    float ViewportX,
    float ViewportY,
    float ViewportWidth,
    float ViewportHeight,
    float ViewportMinZ,
    float ViewportMaxZ,
    CXMMATRIX Projection,
    CXMMATRIX View,
    CXMMATRIX World
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)

    const float HalfViewportWidth = ViewportWidth * 0.5f;
    const float HalfViewportHeight = ViewportHeight * 0.5f;

    // Viewport mapping computed once, applied per point in the loop.
    XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
    XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);

    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);

    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);

        XMVECTOR Result = XMVector3TransformCoord(V, Transform);
        Result = XMVectorMultiplyAdd(Result, Scale, Offset);

        XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Inverse of XMVector3Project: map a viewport-space point back into world
// space by undoing the viewport mapping and transforming by the inverse of
// World*View*Projection.

inline XMVECTOR XMVector3Unproject
(
    FXMVECTOR V,
    float ViewportX,
    float ViewportY,
    float ViewportWidth,
    float ViewportHeight,
    float ViewportMinZ,
    float ViewportMaxZ,
    CXMMATRIX Projection,
    CXMMATRIX View,
    CXMMATRIX World
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // D re-centers x/y into normalized device coordinates ([-1,1] range).
    static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };

    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
    Scale = XMVectorReciprocal(Scale);

    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);

    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);
    // Determinant output is not needed here.
    Transform = XMMatrixInverse(NULL, Transform);

    XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);

    return XMVector3TransformCoord(Result, Transform);

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stream version of XMVector3Unproject: unprojects VectorCount viewport-space
// points back to world space. The matrix inverse is computed once, outside
// the loop. Strides are in bytes. Returns pOutputStream.

_Use_decl_annotations_
inline XMFLOAT3* XMVector3UnprojectStream
(
    XMFLOAT3* pOutputStream,
    size_t OutputStride,
    const XMFLOAT3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    float ViewportX,
    float ViewportY,
    float ViewportWidth,
    float ViewportHeight,
    float ViewportMinZ,
    float ViewportMaxZ,
    CXMMATRIX Projection,
    CXMMATRIX View,
    CXMMATRIX World)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_)

    static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };

    XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
    Scale = XMVectorReciprocal(Scale);

    XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
    Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);

    XMMATRIX Transform = XMMatrixMultiply(World, View);
    Transform = XMMatrixMultiply(Transform, Projection);
    Transform = XMMatrixInverse(NULL, Transform);

    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);

        XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);

        Result = XMVector3TransformCoord(Result, Transform);

        XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

/****************************************************************************
 *
 * 4D Vector
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// True if all four float components of V1 and V2 compare equal.

inline bool XMVector4Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // vzip folds the four 32-bit lane masks down so one 32-bit extract
    // carries a bit from every lane.
    __n128 vResult = vceqq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// Record-style equality: returns XM_CRMASK_CR6TRUE if all components are
// equal, XM_CRMASK_CR6FALSE if all differ, 0 for a mixed result.

inline uint32_t XMVector4EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;

    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] == V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] != V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp);
    uint32_t CR = 0;
    if (iTest==0xf)     // All equal?
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (iTest==0)  // All not equal?
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True if all four components compare equal as raw 32-bit integers
// (bitwise equality — distinguishes +0/-0 and compares NaNs by pattern).

inline bool XMVector4EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_u32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0);
#else
    return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// Record-style integer equality: CR6TRUE if all lanes bitwise-equal,
// CR6FALSE if all differ, 0 otherwise.

inline uint32_t XMVector4EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
        V1.vector4_u32[1] == V2.vector4_u32[1] &&
        V1.vector4_u32[2] == V2.vector4_u32[2] &&
        V1.vector4_u32[3] == V2.vector4_u32[3])
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
        V1.vector4_u32[1] != V2.vector4_u32[1] &&
        V1.vector4_u32[2] != V2.vector4_u32[2] &&
        V1.vector4_u32[3] != V2.vector4_u32[3])
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_u32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp));
    uint32_t CR = 0;
    if (iTest==0xf)     // All equal?
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (iTest==0)  // All not equal?
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True if every component of V1 is within the per-component Epsilon of V2
// (|V1 - V2| <= Epsilon, component-wise).

inline bool XMVector4NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)
    float dx, dy, dz, dw;

    dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
    dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
    dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
    dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]);
    return (((dx <= Epsilon.vector4_f32[0]) &&
        (dy <= Epsilon.vector4_f32[1]) &&
        (dz <= Epsilon.vector4_f32[2]) &&
        (dw <= Epsilon.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // vacleq compares absolute values: |delta| <= |Epsilon|.
    __n128 vDelta = vsubq_f32( V1, V2 );
    __n128 vResult = vacleq_f32( vDelta, Epsilon );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    return ((_mm_movemask_ps(vTemp)==0xf) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True if ANY component of V1 differs from V2 (float compare).

inline bool XMVector4NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2);
    return ((_mm_movemask_ps(vTemp)) != 0);
#else
    return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// True if ANY component differs as a raw 32-bit integer.

inline bool XMVector4NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_u32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0);
#else
    return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// True if every component of V1 is strictly greater than V2's.

inline bool XMVector4Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgtq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    return XMComparisonAllTrue(XMVector4GreaterR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// Record-style greater-than: CR6TRUE if all lanes greater, CR6FALSE if all
// lanes <=, 0 for a mixed result.

inline uint32_t XMVector4GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if (V1.vector4_f32[0] > V2.vector4_f32[0] &&
        V1.vector4_f32[1] > V2.vector4_f32[1] &&
        V1.vector4_f32[2] > V2.vector4_f32[2] &&
        V1.vector4_f32[3] > V2.vector4_f32[3])
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (V1.vector4_f32[0] <= V2.vector4_f32[0] &&
        V1.vector4_f32[1] <= V2.vector4_f32[1] &&
        V1.vector4_f32[2] <= V2.vector4_f32[2] &&
        V1.vector4_f32[3] <= V2.vector4_f32[3])
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgtq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    uint32_t CR = 0;
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf) {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True if every component of V1 is >= the corresponding component of V2.

inline bool XMVector4GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgeq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// Record-style >=: CR6TRUE if all lanes >=, CR6FALSE if all lanes <, 0 mixed.

inline uint32_t XMVector4GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] >= V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] >= V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] < V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] < V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgeq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    uint32_t CR = 0;
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0x0f)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True if every component of V1 is strictly less than V2's.

inline bool XMVector4Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcltq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    // Implemented as V2 > V1 with the arguments swapped.
    return XMComparisonAllTrue(XMVector4GreaterR(V2, V1));
#endif
}

//------------------------------------------------------------------------------
// True if every component of V1 is <= the corresponding component of V2.

inline bool XMVector4LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcleq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    // Implemented as V2 >= V1 with the arguments swapped.
    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1));
#endif
}

//------------------------------------------------------------------------------
// True if each component of V lies in [-Bounds, +Bounds] component-wise.

inline bool XMVector4InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
        (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) &&
        (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    __n128 vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    __n128 vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    // in bounds?
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    // All in bounds?
    return ((_mm_movemask_ps(vTemp1)==0x0f) != 0);
#else
    return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds));
#endif
}


//------------------------------------------------------------------------------
// True if ANY of the four components is NaN.

inline bool XMVector4IsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (XMISNAN(V.vector4_f32[0]) ||
        XMISNAN(V.vector4_f32[1]) ||
        XMISNAN(V.vector4_f32[2]) ||
        XMISNAN(V.vector4_f32[3]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test against itself. NaN is always not equal
    __n128 vTempNan = vceqq_f32( V, V );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    // If any are NaN, the mask is zero
    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
    // If any are NaN, the mask is non-zero
    return (_mm_movemask_ps(vTempNan)!=0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True if ANY of the four components is +/- infinity.

inline bool XMVector4IsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    return (XMISINF(V.vector4_f32[0]) ||
        XMISINF(V.vector4_f32[1]) ||
        XMISINF(V.vector4_f32[2]) ||
        XMISINF(V.vector4_f32[3]));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    __n128 vTempInf = vandq_u32( V, g_XMAbsMask );
    // Compare to infinity
    vTempInf = vceqq_f32(vTempInf, g_XMInfinity );
    // If any are infinity, the signs are true.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) != 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return (_mm_movemask_ps(vTemp) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// 4D dot product; the scalar result is replicated into all four lanes.

inline XMVECTOR XMVector4Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] =
    Result.vector4_f32[1] =
    Result.vector4_f32[2] =
    Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pairwise adds reduce the four products to one sum, then splat.
    __n128 vTemp = vmulq_f32( V1, V2 );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp2 = V2;
    XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
    vTemp2 = _mm_add_ps(vTemp2,vTemp);          // Add Z = X+Z; W = Y+W;
    vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0));  // Copy W to the Z position
    vTemp = _mm_add_ps(vTemp,vTemp2);           // Add Z and W together
    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2));    // Splat Z and return
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// 4D cross product of three vectors (the 4D analogue of the 3D cross:
// a vector orthogonal to V1, V2 and V3). Formula expanded per component
// below; the function body continues past this file chunk.

inline XMVECTOR XMVector4Cross
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
)
{
    // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w),
    //   ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w),
    //   ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w),
    //   ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ]

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;

    Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]);
    Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]);
    Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]);
    Result.vector4_f32[3] = (((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    const __n64 select = vget_low_f32( g_XMMaskX );

    // Term1: V2zwyz * V3wzwy
    const __n64 v2xy = vget_low_f32(V2);
    const __n64 v2zw = vget_high_f32(V2);
    const __n64 v2yx = vrev64_f32(v2xy);
    const __n64 v2wz = vrev64_f32(v2zw);
    const __n64 v2yz = vbsl_f32( select, v2yx, v2wz );

    const __n64 v3zw = vget_high_f32(V3);
    const __n64 v3wz = vrev64_f32(v3zw);
    const __n64 v3xy = vget_low_f32(V3);
    const __n64 v3wy = vbsl_f32( select, v3wz, v3xy );

    __n128 vTemp1 = vcombine_f32(v2zw,v2yz);
    __n128 vTemp2 = vcombine_f32(v3wz,v3wy);
    __n128 vResult = vmulq_f32( vTemp1, vTemp2 );

    // - V2wzwy * V3zwyz
    const __n64 v2wy = vbsl_f32( select, v2wz, v2xy );

    const __n64 v3yx = vrev64_f32(v3xy);
    const __n64 v3yz = vbsl_f32( select, v3yx, v3wz );

    vTemp1 = vcombine_f32(v2wz,v2wy);
    vTemp2 = vcombine_f32(v3zw,v3yz);
    vResult = vmlsq_f32( vResult, vTemp1, vTemp2 );

    // term1 * V1yxxx
    const __n64 v1xy = vget_low_f32(V1);
    const __n64 v1yx = vrev64_f32(v1xy);

    vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) );
    vResult = vmulq_f32( vResult, vTemp1 );

    // Term2: V2ywxz * V3wxwx
    const __n64 v2yw = vrev64_f32(v2wy);
    const __n64 v2xz = vbsl_f32( select, v2xy, v2wz );

    const __n64 v3wx = vbsl_f32( select, v3wz, v3yx );

    vTemp1 = vcombine_f32(v2yw,v2xz);
    vTemp2 = vcombine_f32(v3wx,v3wx);
    __n128 vTerm = vmulq_f32( vTemp1, vTemp2 );

    // - V2wxwx * V3ywxz
    const __n64 v2wx = vbsl_f32( select, v2wz, v2yx );

    const __n64 v3yw = vrev64_f32(v3wy);
    const __n64 v3xz = vbsl_f32( select, v3xy, v3wz );

    vTemp1 = vcombine_f32(v2wx,v2wx);
    vTemp2 = vcombine_f32(v3yw,v3xz);
    vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 );

    // vResult - term2 * V1zzyy
    const __n64 v1zw = vget_high_f32(V1);

    vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) );
    vResult = vmlsq_f32( vResult, vTerm, vTemp1 );

    // Term3: V2yzxy * V3zxyx
    const __n64 v3zx = vrev64_f32(v3xz);

    vTemp1 = vcombine_f32(v2yz,v2xy);
    vTemp2 = vcombine_f32(v3zx,v3yx);
    vTerm = vmulq_f32( vTemp1, vTemp2 );

    // - V2zxyx * V3yzxy
    const __n64 v2zx = vrev64_f32(v2xz);

    vTemp1 = vcombine_f32(v2zx,v2yx);
    vTemp2 = vcombine_f32(v3yz,v3xy);
    vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 );

    // vResult + term3 * V1wwwz
    const __n64 v1wz = vrev64_f32(v1zw);

    vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz );
    return vmlaq_f32( vResult, 
vTerm, vTemp1 ); +#elif defined(_XM_SSE_INTRINSICS_) + // V2zwyz * V3wzwy + XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); + XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); + vResult = _mm_mul_ps(vResult,vTemp3); + // - V2wzwy * V3zwyz + XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); + vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp2); + // term1 * V1yxxx + XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); + vResult = _mm_mul_ps(vResult,vTemp1); + + // V2ywxz * V3wxwx + vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2wxwx * V3ywxz + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); + vTemp2 = _mm_mul_ps(vTemp2,vTemp1); + vTemp3 = _mm_sub_ps(vTemp3,vTemp2); + // vResult - temp * V1zzyy + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp3); + vResult = _mm_sub_ps(vResult,vTemp1); + + // V2yzxy * V3zxyx + vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); + vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp2); + // - V2zxyx * V3yzxy + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); + vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); + vTemp1 = _mm_mul_ps(vTemp1,vTemp2); + vTemp3 = _mm_sub_ps(vTemp3,vTemp1); + // vResult + term * V1wwwz + vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); + vTemp3 = _mm_mul_ps(vTemp3,vTemp1); + vResult = _mm_add_ps(vResult,vTemp3); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4LengthSq +( + FXMVECTOR V +) +{ + return XMVector4Dot(V, V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR 
XMVector4ReciprocalLengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vpadd_f32( v2, v2 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + return vcombine_f32(v2, v2); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_rsqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4ReciprocalLength +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorReciprocalSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vpadd_f32( v2, v2 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt + __n64 S0 = vrsqrte_f32(v1); + __n64 P0 = vmul_f32( v1, S0 ); + __n64 R0 = vrsqrts_f32( P0, S0 ); + __n64 S1 = vmul_f32( S0, R0 ); + __n64 P1 = vmul_f32( v1, 
S1 ); + __n64 R1 = vrsqrts_f32( P1, S1 ); + __n64 Result = vmul_f32( S1, R1 ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + vLengthSq = _mm_sqrt_ps(vLengthSq); + // Accurate! + vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4LengthEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrtEst(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vpadd_f32( v2, v2 ); + v1 = vadd_f32( v1, v2 ); + const __n64 zero = vdup_n_u32(0); + __n64 VEqualsZero = vceq_f32( v1, zero ); + // Sqrt (estimate) + __n64 Result = vrsqrte_f32( v1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + 
vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4Length +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + + Result = XMVector4LengthSq(V); + Result = XMVectorSqrt(Result); + + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vpadd_f32( v2, v2 ); + v1 = vadd_f32( v1, v2 ); + const __n64 zero = vdup_n_u32(0); + __n64 VEqualsZero = vceq_f32( v1, zero ); + // Sqrt + __n64 S0 = vrsqrte_f32( v1 ); + __n64 P0 = vmul_f32( v1, S0 ); + __n64 R0 = vrsqrts_f32( P0, S0 ); + __n64 S1 = vmul_f32( S0, R0 ); + __n64 P1 = vmul_f32( v1, S1 ); + __n64 R1 = vrsqrts_f32( P1, S1 ); + __n64 Result = vmul_f32( S1, R1 ); + Result = vmul_f32( v1, Result ); + Result = vbsl_f32( VEqualsZero, zero, Result ); + return vcombine_f32( Result, Result ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + vLengthSq = _mm_sqrt_ps(vLengthSq); + return vLengthSq; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +// XMVector4NormalizeEst uses a reciprocal estimate and +// returns QNaN on zero and infinite vectors. + +inline XMVECTOR XMVector4NormalizeEst +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result = XMVector4ReciprocalLength(V); + Result = XMVectorMultiply(V, Result); + return Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vpadd_f32( v2, v2 ); + v1 = vadd_f32( v1, v2 ); + // Reciprocal sqrt (estimate) + v2 = vrsqrte_f32( v1 ); + // Normalize + return vmulq_f32( V, vcombine_f32(v2,v2) ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? 
+ vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Get the reciprocal + XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); + // Reciprocal mul to perform the normalization + vResult = _mm_mul_ps(vResult,V); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4Normalize +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fLength; + XMVECTOR vResult; + + vResult = XMVector4Length( V ); + fLength = vResult.vector4_f32[0]; + + // Prevent divide by zero + if (fLength > 0) { + fLength = 1.0f/fLength; + } + + vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; + vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; + vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; + vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Dot4 + __n128 vTemp = vmulq_f32( V, V ); + __n64 v1 = vget_low_f32( vTemp ); + __n64 v2 = vget_high_f32( vTemp ); + v1 = vpadd_f32( v1, v1 ); + v2 = vpadd_f32( v2, v2 ); + v1 = vadd_f32( v1, v2 ); + __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) ); + __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); + // Reciprocal sqrt (2 iterations of Newton-Raphson) + __n64 S0 = vrsqrte_f32( v1 ); + __n64 P0 = vmul_f32( v1, S0 ); + __n64 R0 = vrsqrts_f32( P0, S0 ); + __n64 S1 = vmul_f32( S0, R0 ); + __n64 P1 = vmul_f32( v1, S1 ); + __n64 R1 = vrsqrts_f32( P1, S1 ); + v2 = vmul_f32( S1, R1 ); + // Normalize + __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); + vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); + return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); +#elif defined(_XM_SSE_INTRINSICS_) + // Perform the dot product on x,y,z and w + XMVECTOR vLengthSq = _mm_mul_ps(V,V); + // vTemp has z and w + 
XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); + // x+z, y+w + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // x+z,x+z,x+z,y+w + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); + // ??,??,y+w,y+w + vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); + // ??,??,x+z+y+w,?? + vLengthSq = _mm_add_ps(vLengthSq,vTemp); + // Splat the length + vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); + // Prepare for the division + XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); + // Create zero with a single instruction + XMVECTOR vZeroMask = _mm_setzero_ps(); + // Test for a divide by zero (Must be FP to detect -0.0) + vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); + // Failsafe on zero (Or epsilon) length planes + // If the length is infinity, set the elements to zero + vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); + // Divide to perform the normalization + vResult = _mm_div_ps(V,vResult); + // Any that are infinity, set to zero + vResult = _mm_and_ps(vResult,vZeroMask); + // Select qnan or result based on infinite length + XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); + XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); + vResult = _mm_or_ps(vTemp1,vTemp2); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4ClampLength +( + FXMVECTOR V, + float LengthMin, + float LengthMax +) +{ + XMVECTOR ClampMax = XMVectorReplicate(LengthMax); + XMVECTOR ClampMin = XMVectorReplicate(LengthMin); + + return XMVector4ClampLengthV(V, ClampMin, ClampMax); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4ClampLengthV +( + FXMVECTOR V, + FXMVECTOR LengthMin, + FXMVECTOR LengthMax +) +{ + assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == 
XMVectorGetX(LengthMin))); + assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); + assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); + assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); + + XMVECTOR LengthSq = XMVector4LengthSq(V); + + const XMVECTOR Zero = XMVectorZero(); + + XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); + + XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); + XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); + + XMVECTOR Normal = XMVectorMultiply(V, RcpLength); + + XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); + + XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); + Length = XMVectorSelect(LengthSq, Length, Select); + Normal = XMVectorSelect(LengthSq, Normal, Select); + + XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); + XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); + + XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); + ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); + + XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); + + // Preserve the original vector (with no precision loss) if the length falls within the given range + XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); + Result = XMVectorSelect(Result, V, Control); + + return Result; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4Reflect +( + FXMVECTOR Incident, + FXMVECTOR Normal +) +{ + // Result = Incident - (2 * dot(Incident, Normal)) * Normal + + XMVECTOR Result = XMVector4Dot(Incident, Normal); + Result = XMVectorAdd(Result, Result); + Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); + + return Result; +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4Refract +( + FXMVECTOR Incident, + FXMVECTOR Normal, + float RefractionIndex +) +{ + XMVECTOR Index = XMVectorReplicate(RefractionIndex); + return XMVector4RefractV(Incident, Normal, Index); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4RefractV +( + FXMVECTOR Incident, + FXMVECTOR Normal, + FXMVECTOR RefractionIndex +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR IDotN; + XMVECTOR R; + const XMVECTOR Zero = XMVectorZero(); + + // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + + // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) + + IDotN = XMVector4Dot(Incident, Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); + R = XMVectorMultiply(R, RefractionIndex); + R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); + + if (XMVector4LessOrEqual(R, Zero)) + { + // Total internal reflection + return Zero; + } + else + { + XMVECTOR Result; + + // R = RefractionIndex * IDotN + sqrt(R) + R = XMVectorSqrt(R); + R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); + + // Result = RefractionIndex * Incident - Normal * R + Result = XMVectorMultiply(RefractionIndex, Incident); + Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); + + return Result; + } + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN); + R = vmulq_f32(R, RefractionIndex); + R = vmlsq_f32(g_XMOne, R, RefractionIndex ); + + __n128 vResult = vcleq_f32(R,g_XMZero); + int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); + vTemp = 
vzip_u16(vTemp.val[0], vTemp.val[1]); + if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // Sqrt(R) + __n128 S0 = vrsqrteq_f32(R); + __n128 P0 = vmulq_f32( R, S0 ); + __n128 R0 = vrsqrtsq_f32( P0, S0 ); + __n128 S1 = vmulq_f32( S0, R0 ); + __n128 P1 = vmulq_f32( R, S1 ); + __n128 R1 = vrsqrtsq_f32( P1, S1 ); + __n128 S2 = vmulq_f32( S1, R1 ); + R = vmulq_f32( R, S2 ); + // R = RefractionIndex * IDotN + sqrt(R) + R = vmlaq_f32( R, RefractionIndex, IDotN ); + // Result = RefractionIndex * Incident - Normal * R + vResult = vmulq_f32(RefractionIndex, Incident); + vResult = vmlsq_f32( vResult, R, Normal ); + } + return vResult; +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR IDotN = XMVector4Dot(Incident,Normal); + + // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) + XMVECTOR R = _mm_mul_ps(IDotN,IDotN); + R = _mm_sub_ps(g_XMOne,R); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_mul_ps(R, RefractionIndex); + R = _mm_sub_ps(g_XMOne,R); + + XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); + if (_mm_movemask_ps(vResult)==0x0f) + { + // Total internal reflection + vResult = g_XMZero; + } + else + { + // R = RefractionIndex * IDotN + sqrt(R) + R = _mm_sqrt_ps(R); + vResult = _mm_mul_ps(RefractionIndex, IDotN); + R = _mm_add_ps(R,vResult); + // Result = RefractionIndex * Incident - Normal * R + vResult = _mm_mul_ps(RefractionIndex, Incident); + R = _mm_mul_ps(R,Normal); + vResult = _mm_sub_ps(vResult,R); + } + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4Orthogonal +( + FXMVECTOR V +) +{ +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR Result; + Result.vector4_f32[0] = V.vector4_f32[2]; + Result.vector4_f32[1] = V.vector4_f32[3]; + Result.vector4_f32[2] = -V.vector4_f32[0]; + Result.vector4_f32[3] = -V.vector4_f32[1]; + return 
Result; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; + + __n128 Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); + return vmulq_f32( Result, Negate ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; + XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); + vResult = _mm_mul_ps(vResult,FlipZW); + return vResult; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4AngleBetweenNormalsEst +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACosEst(Result); + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4AngleBetweenNormals +( + FXMVECTOR N1, + FXMVECTOR N2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR Result = XMVector4Dot(N1, N2); + Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); + Result = XMVectorACos(Result); + return Result; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4AngleBetweenVectors +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMVECTOR L1 = XMVector4ReciprocalLength(V1); + XMVECTOR L2 = XMVector4ReciprocalLength(V2); + + XMVECTOR Dot = XMVector4Dot(V1, V2); + + L1 = XMVectorMultiply(L1, L2); + + XMVECTOR CosAngle = 
XMVectorMultiply(Dot, L1); + CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); + + return XMVectorACos(CosAngle); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR XMVector4Transform +( + FXMVECTOR V, + CXMMATRIX M +) +{ +#if defined(_XM_NO_INTRINSICS_) + float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); + float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); + float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); + float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); + XMVECTOR vResult = { + fX, + fY, + fZ, + fW + }; + return vResult; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32( V ); + XMVECTOR vTemp1 = vdupq_lane_f32( VL, 0 ); // X + XMVECTOR vTemp2 = vdupq_lane_f32( VL, 1 ); // Y + XMVECTOR vResult = vmulq_f32( vTemp1, M.r[0] ); + vResult = vmlaq_f32( vResult, vTemp2, M.r[1] ); + __n64 VH = vget_high_f32( V ); + vTemp1 = vdupq_lane_f32( VH, 0 ); // Z + vTemp2 = vdupq_lane_f32( VH, 1 ); // W + vResult = vmlaq_f32( vResult, vTemp1, M.r[2] ); + return vmlaq_f32( vResult, vTemp2, M.r[3] ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat x,y,z and w + XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); + XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); + // Mul by the matrix + vTempX = _mm_mul_ps(vTempX,M.r[0]); + vTempY = _mm_mul_ps(vTempY,M.r[1]); + vTempZ = _mm_mul_ps(vTempZ,M.r[2]); + vTempW = _mm_mul_ps(vTempW,M.r[3]); + // Add them all together + vTempX = 
_mm_add_ps(vTempX,vTempY); + vTempZ = _mm_add_ps(vTempZ,vTempW); + vTempX = _mm_add_ps(vTempX,vTempZ); + return vTempX; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMFLOAT4* XMVector4TransformStream +( + XMFLOAT4* pOutputStream, + size_t OutputStride, + const XMFLOAT4* pInputStream, + size_t InputStride, + size_t VectorCount, + CXMMATRIX M +) +{ + assert(pOutputStream != NULL); + assert(pInputStream != NULL); + +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_) + + const uint8_t* pInputVector = (const uint8_t*)pInputStream; + uint8_t* pOutputVector = (uint8_t*)pOutputStream; + + const XMVECTOR row0 = M.r[0]; + const XMVECTOR row1 = M.r[1]; + const XMVECTOR row2 = M.r[2]; + const XMVECTOR row3 = M.r[3]; + + for (size_t i = 0; i < VectorCount; i++) + { + XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector); + XMVECTOR W = XMVectorSplatW(V); + XMVECTOR Z = XMVectorSplatZ(V); + XMVECTOR Y = XMVectorSplatY(V); + XMVECTOR X = XMVectorSplatX(V); + + XMVECTOR Result = XMVectorMultiply(W, row3); + Result = XMVectorMultiplyAdd(Z, row2, Result); + Result = XMVectorMultiplyAdd(Y, row1, Result); + Result = XMVectorMultiplyAdd(X, row0, Result); + + XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); + + pInputVector += InputStride; + pOutputVector += OutputStride; + } + + return pOutputStream; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * XMVECTOR operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline XMVECTOR operator+ (FXMVECTOR V) +{ + return V; +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR operator- (FXMVECTOR V) +{ + return XMVectorNegate(V); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator+= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorAdd(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator-= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorSubtract(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorMultiply(V1, V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V1, + FXMVECTOR V2 +) +{ + V1 = XMVectorDivide(V1,V2); + return V1; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator*= +( + XMVECTOR& V, + const float S +) +{ + V = XMVectorScale(V, S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR& operator/= +( + XMVECTOR& V, + const float S +) +{ + assert( S != 0.0f ); + V = XMVectorScale(V, 1.0f / S); + return V; +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR operator+ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorAdd(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR operator- +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorSubtract(V1, V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR operator* +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorMultiply(V1, V2); +} + 
+//------------------------------------------------------------------------------ + +inline XMVECTOR operator/ +( + FXMVECTOR V1, + FXMVECTOR V2 +) +{ + return XMVectorDivide(V1,V2); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR operator* +( + FXMVECTOR V, + const float S +) +{ + return XMVectorScale(V, S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR operator/ +( + FXMVECTOR V, + const float S +) +{ + assert( S != 0.0f ); + return XMVectorScale(V, 1.0f / S); +} + +//------------------------------------------------------------------------------ + +inline XMVECTOR operator* +( + float S, + FXMVECTOR V +) +{ + return XMVectorScale(V, S); +} + +#if defined(_XM_NO_INTRINSICS_) +#undef XMISNAN +#undef XMISINF +#endif + + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.h b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.h new file mode 100644 index 00000000..66df02fd --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.h @@ -0,0 +1,995 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.h -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. 
+//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + +#include "DirectXMath.h" + +namespace DirectX +{ + +namespace PackedVector +{ + +#ifdef _XM_BIGENDIAN_ +#pragma bitfield_order(push) +#pragma bitfield_order(lsb_to_msb) +#endif + +#pragma warning(push) +#pragma warning(disable:4201 4365 4324) + +//------------------------------------------------------------------------------ +// ARGB Color; 8-8-8-8 bit unsigned normalized integer components packed into +// a 32 bit integer. The normalized color is packed into 32 bits using 8 bit +// unsigned, normalized integers for the alpha, red, green, and blue components. +// The alpha component is stored in the most significant bits and the blue +// component in the least significant bits (A8R8G8B8): +// [32] aaaaaaaa rrrrrrrr gggggggg bbbbbbbb [0] +struct XMCOLOR +{ + union + { + struct + { + uint8_t b; // Blue: 0/255 to 255/255 + uint8_t g; // Green: 0/255 to 255/255 + uint8_t r; // Red: 0/255 to 255/255 + uint8_t a; // Alpha: 0/255 to 255/255 + }; + uint32_t c; + }; + + XMCOLOR() {} + XMCOLOR(uint32_t Color) : c(Color) {} + XMCOLOR(float _r, float _g, float _b, float _a); + explicit XMCOLOR(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return c; } + + XMCOLOR& operator= (const XMCOLOR& Color) { c = Color.c; return *this; } + XMCOLOR& operator= (const uint32_t Color) { c = Color; return *this; } +}; + +//------------------------------------------------------------------------------ +// 16 bit floating point number consisting of a sign bit, a 5 bit biased +// exponent, and a 10 bit mantissa +typedef uint16_t HALF; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit floating point components +struct XMHALF2 +{ + union + { + struct + { + HALF x; + HALF y; + }; + uint32_t v; + }; + + XMHALF2() {} + explicit XMHALF2(uint32_t Packed) : v(Packed) {} + XMHALF2(HALF _x, HALF 
_y) : x(_x), y(_y) {} + explicit XMHALF2(_In_reads_(2) const HALF *pArray) : x(pArray[0]), y(pArray[1]) {} + XMHALF2(float _x, float _y); + explicit XMHALF2(_In_reads_(2) const float *pArray); + + XMHALF2& operator= (const XMHALF2& Half2) { x = Half2.x; y = Half2.y; return *this; } + XMHALF2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 16 bit signed normalized integer components +struct XMSHORTN2 +{ + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORTN2() {} + explicit XMSHORTN2(uint32_t Packed) : v(Packed) {} + XMSHORTN2(int16_t _x, int16_t _y) : x(_x), y(_y) {} + explicit XMSHORTN2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMSHORTN2(float _x, float _y); + explicit XMSHORTN2(_In_reads_(2) const float *pArray); + + XMSHORTN2& operator= (const XMSHORTN2& ShortN2) { x = ShortN2.x; y = ShortN2.y; return *this; } + XMSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit signed integer components +struct XMSHORT2 +{ + union + { + struct + { + int16_t x; + int16_t y; + }; + uint32_t v; + }; + + XMSHORT2() {} + explicit XMSHORT2(uint32_t Packed) : v(Packed) {} + XMSHORT2(int16_t _x, int16_t _y) : x(_x), y(_y) {} + explicit XMSHORT2(_In_reads_(2) const int16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMSHORT2(float _x, float _y); + explicit XMSHORT2(_In_reads_(2) const float *pArray); + + XMSHORT2& operator= (const XMSHORT2& Short2) { x = Short2.x; y = Short2.y; return *this; } + XMSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit unsigned normalized integer components +struct XMUSHORTN2 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORTN2() {} + explicit XMUSHORTN2(uint32_t Packed) : v(Packed) {} + XMUSHORTN2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} + explicit 
XMUSHORTN2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUSHORTN2(float _x, float _y); + explicit XMUSHORTN2(_In_reads_(2) const float *pArray); + + XMUSHORTN2& operator= (const XMUSHORTN2& UShortN2) { x = UShortN2.x; y = UShortN2.y; return *this; } + XMUSHORTN2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 16 bit unsigned integer components +struct XMUSHORT2 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + }; + uint32_t v; + }; + + XMUSHORT2() {} + explicit XMUSHORT2(uint32_t Packed) : v(Packed) {} + XMUSHORT2(uint16_t _x, uint16_t _y) : x(_x), y(_y) {} + explicit XMUSHORT2(_In_reads_(2) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUSHORT2(float _x, float _y); + explicit XMUSHORT2(_In_reads_(2) const float *pArray); + + XMUSHORT2& operator= (const XMUSHORT2& UShort2) { x = UShort2.x; y = UShort2.y; return *this; } + XMUSHORT2& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 2D Vector; 8 bit signed normalized integer components +struct XMBYTEN2 +{ + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTEN2() {} + explicit XMBYTEN2(uint16_t Packed) : v(Packed) {} + XMBYTEN2(int8_t _x, int8_t _y) : x(_x), y(_y) {} + explicit XMBYTEN2(_In_reads_(2) const int8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMBYTEN2(float _x, float _y); + explicit XMBYTEN2(_In_reads_(2) const float *pArray); + + XMBYTEN2& operator= (const XMBYTEN2& ByteN2) { x = ByteN2.x; y = ByteN2.y; return *this; } + XMBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit signed integer components +struct XMBYTE2 +{ + union + { + struct + { + int8_t x; + int8_t y; + }; + uint16_t v; + }; + + XMBYTE2() {} + explicit XMBYTE2(uint16_t Packed) : v(Packed) {} + XMBYTE2(int8_t _x, int8_t _y) : x(_x), y(_y) {} + explicit XMBYTE2(_In_reads_(2) const int8_t 
*pArray) : x(pArray[0]), y(pArray[1]) {} + XMBYTE2(float _x, float _y); + explicit XMBYTE2(_In_reads_(2) const float *pArray); + + XMBYTE2& operator= (const XMBYTE2& Byte2) { x = Byte2.x; y = Byte2.y; return *this; } + XMBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit unsigned normalized integer components +struct XMUBYTEN2 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTEN2() {} + explicit XMUBYTEN2(uint16_t Packed) : v(Packed) {} + XMUBYTEN2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} + explicit XMUBYTEN2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUBYTEN2(float _x, float _y); + explicit XMUBYTEN2(_In_reads_(2) const float *pArray); + + XMUBYTEN2& operator= (const XMUBYTEN2& UByteN2) { x = UByteN2.x; y = UByteN2.y; return *this; } + XMUBYTEN2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +// 2D Vector; 8 bit unsigned integer components +struct XMUBYTE2 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + }; + uint16_t v; + }; + + XMUBYTE2() {} + explicit XMUBYTE2(uint16_t Packed) : v(Packed) {} + XMUBYTE2(uint8_t _x, uint8_t _y) : x(_x), y(_y) {} + explicit XMUBYTE2(_In_reads_(2) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]) {} + XMUBYTE2(float _x, float _y); + explicit XMUBYTE2(_In_reads_(2) const float *pArray); + + XMUBYTE2& operator= (const XMUBYTE2& UByte2) { x = UByte2.x; y = UByte2.y; return *this; } + XMUBYTE2& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 5/6/5 unsigned integer components +struct XMU565 +{ + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 6; // 0 to 63 + uint16_t z : 5; // 0 to 31 + }; + uint16_t v; + }; + + XMU565() {} + explicit XMU565(uint16_t Packed) : v(Packed) {} + XMU565(uint8_t _x, uint8_t _y, uint8_t _z) : x(_x), y(_y), z(_z) {} + explicit XMU565(_In_reads_(3) 
const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]) {} + XMU565(float _x, float _y, float _z); + explicit XMU565(_In_reads_(3) const float *pArray); + + operator uint16_t () const { return v; } + + XMU565& operator= (const XMU565& U565) { v = U565.v; return *this; } + XMU565& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 11/11/10 floating-point components +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// and 6-bit mantissa for x component, a 5-bit biased exponent and +// 6-bit mantissa for y component, a 5-bit biased exponent and a 5-bit +// mantissa for z. The z component is stored in the most significant bits +// and the x component in the least significant bits. No sign bits so +// all partial-precision numbers are positive. +// (Z10Y11X11): [32] ZZZZZzzz zzzYYYYY yyyyyyXX XXXxxxxx [0] +struct XMFLOAT3PK +{ + union + { + struct + { + uint32_t xm : 6; // x-mantissa + uint32_t xe : 5; // x-exponent + uint32_t ym : 6; // y-mantissa + uint32_t ye : 5; // y-exponent + uint32_t zm : 5; // z-mantissa + uint32_t ze : 5; // z-exponent + }; + uint32_t v; + }; + + XMFLOAT3PK() {} + explicit XMFLOAT3PK(uint32_t Packed) : v(Packed) {} + XMFLOAT3PK(float _x, float _y, float _z); + explicit XMFLOAT3PK(_In_reads_(3) const float *pArray); + + operator uint32_t () const { return v; } + + XMFLOAT3PK& operator= (const XMFLOAT3PK& float3pk) { v = float3pk.v; return *this; } + XMFLOAT3PK& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 3D vector: 9/9/9 floating-point components with shared 5-bit exponent +// The 3D vector is packed into 32 bits as follows: a 5-bit biased exponent +// with 9-bit mantissa for the x, y, and z component. 
The shared exponent +// is stored in the most significant bits and the x component mantissa is in +// the least significant bits. No sign bits so all partial-precision numbers +// are positive. +// (E5Z9Y9X9): [32] EEEEEzzz zzzzzzyy yyyyyyyx xxxxxxxx [0] +struct XMFLOAT3SE +{ + union + { + struct + { + uint32_t xm : 9; // x-mantissa + uint32_t ym : 9; // y-mantissa + uint32_t zm : 9; // z-mantissa + uint32_t e : 5; // shared exponent + }; + uint32_t v; + }; + + XMFLOAT3SE() {} + explicit XMFLOAT3SE(uint32_t Packed) : v(Packed) {} + XMFLOAT3SE(float _x, float _y, float _z); + explicit XMFLOAT3SE(_In_reads_(3) const float *pArray); + + operator uint32_t () const { return v; } + + XMFLOAT3SE& operator= (const XMFLOAT3SE& float3se) { v = float3se.v; return *this; } + XMFLOAT3SE& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit floating point components +struct XMHALF4 +{ + union + { + struct + { + HALF x; + HALF y; + HALF z; + HALF w; + }; + uint64_t v; + }; + + XMHALF4() {} + explicit XMHALF4(uint64_t Packed) : v(Packed) {} + XMHALF4(HALF _x, HALF _y, HALF _z, HALF _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMHALF4(_In_reads_(4) const HALF *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMHALF4(float _x, float _y, float _z, float _w); + explicit XMHALF4(_In_reads_(4) const float *pArray); + + XMHALF4& operator= (const XMHALF4& Half4) { x = Half4.x; y = Half4.y; z = Half4.z; w = Half4.w; return *this; } + XMHALF4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 16 bit signed normalized integer components +struct XMSHORTN4 +{ + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORTN4() {} + explicit XMSHORTN4(uint64_t Packed) : v(Packed) {} + XMSHORTN4(int16_t 
_x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORTN4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORTN4(float _x, float _y, float _z, float _w); + explicit XMSHORTN4(_In_reads_(4) const float *pArray); + + XMSHORTN4& operator= (const XMSHORTN4& ShortN4) { x = ShortN4.x; y = ShortN4.y; z = ShortN4.z; w = ShortN4.w; return *this; } + XMSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit signed integer components +struct XMSHORT4 +{ + union + { + struct + { + int16_t x; + int16_t y; + int16_t z; + int16_t w; + }; + uint64_t v; + }; + + XMSHORT4() {} + explicit XMSHORT4(uint64_t Packed) : v(Packed) {} + XMSHORT4(int16_t _x, int16_t _y, int16_t _z, int16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMSHORT4(_In_reads_(4) const int16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMSHORT4(float _x, float _y, float _z, float _w); + explicit XMSHORT4(_In_reads_(4) const float *pArray); + + XMSHORT4& operator= (const XMSHORT4& Short4) { x = Short4.x; y = Short4.y; z = Short4.z; w = Short4.w; return *this; } + XMSHORT4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit unsigned normalized integer components +struct XMUSHORTN4 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORTN4() {} + explicit XMUSHORTN4(uint64_t Packed) : v(Packed) {} + XMUSHORTN4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORTN4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORTN4(float _x, float _y, float _z, float _w); + explicit XMUSHORTN4(_In_reads_(4) const float *pArray); + + XMUSHORTN4& operator= (const XMUSHORTN4& UShortN4) { x = UShortN4.x; y = UShortN4.y; z = UShortN4.z; w = UShortN4.w; return *this; } + 
XMUSHORTN4& operator= (uint64_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 16 bit unsigned integer components +struct XMUSHORT4 +{ + union + { + struct + { + uint16_t x; + uint16_t y; + uint16_t z; + uint16_t w; + }; + uint64_t v; + }; + + XMUSHORT4() {} + explicit XMUSHORT4(uint64_t Packed) : v(Packed) {} + XMUSHORT4(uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUSHORT4(_In_reads_(4) const uint16_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUSHORT4(float _x, float _y, float _z, float _w); + explicit XMUSHORT4(_In_reads_(4) const float *pArray); + + XMUSHORT4& operator= (const XMUSHORT4& UShort4) { x = UShort4.x; y = UShort4.y; z = UShort4.z; w = UShort4.w; return *this; } + XMUSHORT4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMXDECN4 +{ + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMXDECN4() {} + explicit XMXDECN4(uint32_t Packed) : v(Packed) {} + XMXDECN4(float _x, float _y, float _z, float _w); + explicit XMXDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMXDECN4& operator= (const XMXDECN4& XDecN4) { v = XDecN4.v; return *this; } + XMXDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMXDEC4 +{ + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMXDEC4() {} + explicit XMXDEC4(uint32_t Packed) : v(Packed) {} + XMXDEC4(float _x, float _y, float _z, float _w); + explicit XMXDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMXDEC4& operator= (const XMXDEC4& XDec4) { v = XDec4.v; return *this; } + XMXDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// normalized integer for the w component and 10 bit signed, normalized +// integers for the z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMDECN4 +{ + union + { + struct + { + int32_t x : 10; // -511/511 to 511/511 + int32_t y : 10; // -511/511 to 511/511 + int32_t z : 10; // -511/511 to 511/511 + int32_t w : 2; // -1/1 to 1/1 + }; + uint32_t v; + }; + + XMDECN4() {} + explicit XMDECN4(uint32_t Packed) : v(Packed) {} + XMDECN4(float _x, float _y, float _z, float _w); + explicit XMDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMDECN4& operator= (const XMDECN4& DecN4) { v = DecN4.v; return *this; } + XMDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit signed, +// integer for the w component and 10 bit signed integers for the +// z, y, and x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMDEC4 +{ + union + { + struct + { + int32_t x : 10; // -511 to 511 + int32_t y : 10; // -511 to 511 + int32_t z : 10; // -511 to 511 + int32_t w : 2; // -1 to 1 + }; + uint32_t v; + }; + + XMDEC4() {} + explicit XMDEC4(uint32_t Packed) : v(Packed) {} + XMDEC4(float _x, float _y, float _z, float _w); + explicit XMDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMDEC4& operator= (const XMDEC4& Dec4) { v = Dec4.v; return *this; } + XMDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit normalized components packed into a 32 bit integer +// The normalized 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// normalized integer for the w component and 10 bit unsigned, normalized +// integers for the z, y, and 
x components. The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMUDECN4 +{ + union + { + struct + { + uint32_t x : 10; // 0/1023 to 1023/1023 + uint32_t y : 10; // 0/1023 to 1023/1023 + uint32_t z : 10; // 0/1023 to 1023/1023 + uint32_t w : 2; // 0/3 to 3/3 + }; + uint32_t v; + }; + + XMUDECN4() {} + explicit XMUDECN4(uint32_t Packed) : v(Packed) {} + XMUDECN4(float _x, float _y, float _z, float _w); + explicit XMUDECN4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMUDECN4& operator= (const XMUDECN4& UDecN4) { v = UDecN4.v; return *this; } + XMUDECN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 10-10-10-2 bit components packed into a 32 bit integer +// The 4D Vector is packed into 32 bits as follows: a 2 bit unsigned, +// integer for the w component and 10 bit unsigned integers +// for the z, y, and x components. 
The w component is stored in the +// most significant bits and the x component in the least significant bits +// (W2Z10Y10X10): [32] wwzzzzzz zzzzyyyy yyyyyyxx xxxxxxxx [0] +struct XMUDEC4 +{ + union + { + struct + { + uint32_t x : 10; // 0 to 1023 + uint32_t y : 10; // 0 to 1023 + uint32_t z : 10; // 0 to 1023 + uint32_t w : 2; // 0 to 3 + }; + uint32_t v; + }; + + XMUDEC4() {} + explicit XMUDEC4(uint32_t Packed) : v(Packed) {} + XMUDEC4(float _x, float _y, float _z, float _w); + explicit XMUDEC4(_In_reads_(4) const float *pArray); + + operator uint32_t () const { return v; } + + XMUDEC4& operator= (const XMUDEC4& UDec4) { v = UDec4.v; return *this; } + XMUDEC4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D Vector; 8 bit signed normalized integer components +struct XMBYTEN4 +{ + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTEN4() {} + XMBYTEN4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMBYTEN4(uint32_t Packed) : v(Packed) {} + explicit XMBYTEN4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMBYTEN4(float _x, float _y, float _z, float _w); + explicit XMBYTEN4(_In_reads_(4) const float *pArray); + + XMBYTEN4& operator= (const XMBYTEN4& ByteN4) { x = ByteN4.x; y = ByteN4.y; z = ByteN4.z; w = ByteN4.w; return *this; } + XMBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit signed integer components +struct XMBYTE4 +{ + union + { + struct + { + int8_t x; + int8_t y; + int8_t z; + int8_t w; + }; + uint32_t v; + }; + + XMBYTE4() {} + XMBYTE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMBYTE4(uint32_t Packed) : v(Packed) {} + explicit XMBYTE4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), 
w(pArray[3]) {} + XMBYTE4(float _x, float _y, float _z, float _w); + explicit XMBYTE4(_In_reads_(4) const float *pArray); + + XMBYTE4& operator= (const XMBYTE4& Byte4) { x = Byte4.x; y = Byte4.y; z = Byte4.z; w = Byte4.w; return *this; } + XMBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit unsigned normalized integer components +struct XMUBYTEN4 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTEN4() {} + XMUBYTEN4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUBYTEN4(uint32_t Packed) : v(Packed) {} + explicit XMUBYTEN4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTEN4(float _x, float _y, float _z, float _w); + explicit XMUBYTEN4(_In_reads_(4) const float *pArray); + + XMUBYTEN4& operator= (const XMUBYTEN4& UByteN4) { x = UByteN4.x; y = UByteN4.y; z = UByteN4.z; w = UByteN4.w; return *this; } + XMUBYTEN4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +// 4D Vector; 8 bit unsigned integer components +struct XMUBYTE4 +{ + union + { + struct + { + uint8_t x; + uint8_t y; + uint8_t z; + uint8_t w; + }; + uint32_t v; + }; + + XMUBYTE4() {} + XMUBYTE4(uint8_t _x, uint8_t _y, uint8_t _z, uint8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUBYTE4(uint32_t Packed) : v(Packed) {} + explicit XMUBYTE4(_In_reads_(4) const uint8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUBYTE4(float _x, float _y, float _z, float _w); + explicit XMUBYTE4(_In_reads_(4) const float *pArray); + + XMUBYTE4& operator= (const XMUBYTE4& UByte4) { x = UByte4.x; y = UByte4.y; z = UByte4.z; w = UByte4.w; return *this; } + XMUBYTE4& operator= (uint32_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D vector; 4 bit unsigned integer components +struct XMUNIBBLE4 
+{ + union + { + struct + { + uint16_t x : 4; // 0 to 15 + uint16_t y : 4; // 0 to 15 + uint16_t z : 4; // 0 to 15 + uint16_t w : 4; // 0 to 15 + }; + uint16_t v; + }; + + XMUNIBBLE4() {} + explicit XMUNIBBLE4(uint16_t Packed) : v(Packed) {} + XMUNIBBLE4(int8_t _x, int8_t _y, int8_t _z, int8_t _w) : x(_x), y(_y), z(_z), w(_w) {} + explicit XMUNIBBLE4(_In_reads_(4) const int8_t *pArray) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(pArray[3]) {} + XMUNIBBLE4(float _x, float _y, float _z, float _w); + explicit XMUNIBBLE4(_In_reads_(4) const float *pArray); + + operator uint16_t () const { return v; } + + XMUNIBBLE4& operator= (const XMUNIBBLE4& UNibble4) { v = UNibble4.v; return *this; } + XMUNIBBLE4& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + +//------------------------------------------------------------------------------ +// 4D vector: 5/5/5/1 unsigned integer components +struct XMU555 +{ + union + { + struct + { + uint16_t x : 5; // 0 to 31 + uint16_t y : 5; // 0 to 31 + uint16_t z : 5; // 0 to 31 + uint16_t w : 1; // 0 or 1 + }; + uint16_t v; + }; + + XMU555() {} + explicit XMU555(uint16_t Packed) : v(Packed) {} + XMU555(int8_t _x, int8_t _y, int8_t _z, bool _w) : x(_x), y(_y), z(_z), w(_w ? 0x1 : 0) {} + XMU555(_In_reads_(3) const int8_t *pArray, _In_ bool _w) : x(pArray[0]), y(pArray[1]), z(pArray[2]), w(_w ? 
0x1 : 0) {} + XMU555(float _x, float _y, float _z, bool _w); + XMU555(_In_reads_(3) const float *pArray, _In_ bool _w); + + operator uint16_t () const { return v; } + + XMU555& operator= (const XMU555& U555) { v = U555.v; return *this; } + XMU555& operator= (uint16_t Packed) { v = Packed; return *this; } +}; + + +#pragma warning(pop) + +#ifdef _XM_BIGENDIAN_ +#pragma bitfield_order(pop) +#endif + + +/**************************************************************************** + * + * Data conversion operations + * + ****************************************************************************/ + +float XMConvertHalfToFloat(HALF Value); +float* XMConvertHalfToFloatStream(_Out_writes_bytes_(sizeof(float)+OutputStride*(HalfCount-1)) float* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(HALF)+InputStride*(HalfCount-1)) const HALF* pInputStream, + _In_ size_t InputStride, _In_ size_t HalfCount); +HALF XMConvertFloatToHalf(float Value); +HALF* XMConvertFloatToHalfStream(_Out_writes_bytes_(sizeof(HALF)+OutputStride*(FloatCount-1)) HALF* pOutputStream, + _In_ size_t OutputStride, + _In_reads_bytes_(sizeof(float)+InputStride*(FloatCount-1)) const float* pInputStream, + _In_ size_t InputStride, _In_ size_t FloatCount); + +/**************************************************************************** + * + * Load operations + * + ****************************************************************************/ + +XMVECTOR XMLoadColor(_In_ const XMCOLOR* pSource); + +XMVECTOR XMLoadHalf2(_In_ const XMHALF2* pSource); +XMVECTOR XMLoadShortN2(_In_ const XMSHORTN2* pSource); +XMVECTOR XMLoadShort2(_In_ const XMSHORT2* pSource); +XMVECTOR XMLoadUShortN2(_In_ const XMUSHORTN2* pSource); +XMVECTOR XMLoadUShort2(_In_ const XMUSHORT2* pSource); +XMVECTOR XMLoadByteN2(_In_ const XMBYTEN2* pSource); +XMVECTOR XMLoadByte2(_In_ const XMBYTE2* pSource); +XMVECTOR XMLoadUByteN2(_In_ const XMUBYTEN2* pSource); +XMVECTOR XMLoadUByte2(_In_ const XMUBYTE2* pSource); + +XMVECTOR 
XMLoadU565(_In_ const XMU565* pSource); +XMVECTOR XMLoadFloat3PK(_In_ const XMFLOAT3PK* pSource); +XMVECTOR XMLoadFloat3SE(_In_ const XMFLOAT3SE* pSource); + +XMVECTOR XMLoadHalf4(_In_ const XMHALF4* pSource); +XMVECTOR XMLoadShortN4(_In_ const XMSHORTN4* pSource); +XMVECTOR XMLoadShort4(_In_ const XMSHORT4* pSource); +XMVECTOR XMLoadUShortN4(_In_ const XMUSHORTN4* pSource); +XMVECTOR XMLoadUShort4(_In_ const XMUSHORT4* pSource); +XMVECTOR XMLoadXDecN4(_In_ const XMXDECN4* pSource); +XMVECTOR XMLoadXDec4(_In_ const XMXDEC4* pSource); +XMVECTOR XMLoadDecN4(_In_ const XMDECN4* pSource); +XMVECTOR XMLoadDec4(_In_ const XMDEC4* pSource); +XMVECTOR XMLoadUDecN4(_In_ const XMUDECN4* pSource); +XMVECTOR XMLoadUDec4(_In_ const XMUDEC4* pSource); +XMVECTOR XMLoadByteN4(_In_ const XMBYTEN4* pSource); +XMVECTOR XMLoadByte4(_In_ const XMBYTE4* pSource); +XMVECTOR XMLoadUByteN4(_In_ const XMUBYTEN4* pSource); +XMVECTOR XMLoadUByte4(_In_ const XMUBYTE4* pSource); +XMVECTOR XMLoadUNibble4(_In_ const XMUNIBBLE4* pSource); +XMVECTOR XMLoadU555(_In_ const XMU555* pSource); + + +/**************************************************************************** + * + * Store operations + * + ****************************************************************************/ + +void XMStoreColor(_Out_ XMCOLOR* pDestination, _In_ FXMVECTOR V); + +void XMStoreHalf2(_Out_ XMHALF2* pDestination, _In_ FXMVECTOR V); +void XMStoreShortN2(_Out_ XMSHORTN2* pDestination, _In_ FXMVECTOR V); +void XMStoreShort2(_Out_ XMSHORT2* pDestination, _In_ FXMVECTOR V); +void XMStoreUShortN2(_Out_ XMUSHORTN2* pDestination, _In_ FXMVECTOR V); +void XMStoreUShort2(_Out_ XMUSHORT2* pDestination, _In_ FXMVECTOR V); +void XMStoreByteN2(_Out_ XMBYTEN2* pDestination, _In_ FXMVECTOR V); +void XMStoreByte2(_Out_ XMBYTE2* pDestination, _In_ FXMVECTOR V); +void XMStoreUByteN2(_Out_ XMUBYTEN2* pDestination, _In_ FXMVECTOR V); +void XMStoreUByte2(_Out_ XMUBYTE2* pDestination, _In_ FXMVECTOR V); + +void XMStoreU565(_Out_ XMU565* 
pDestination, _In_ FXMVECTOR V); +void XMStoreFloat3PK(_Out_ XMFLOAT3PK* pDestination, _In_ FXMVECTOR V); +void XMStoreFloat3SE(_Out_ XMFLOAT3SE* pDestination, _In_ FXMVECTOR V); + +void XMStoreHalf4(_Out_ XMHALF4* pDestination, _In_ FXMVECTOR V); +void XMStoreShortN4(_Out_ XMSHORTN4* pDestination, _In_ FXMVECTOR V); +void XMStoreShort4(_Out_ XMSHORT4* pDestination, _In_ FXMVECTOR V); +void XMStoreUShortN4(_Out_ XMUSHORTN4* pDestination, _In_ FXMVECTOR V); +void XMStoreUShort4(_Out_ XMUSHORT4* pDestination, _In_ FXMVECTOR V); +void XMStoreXDecN4(_Out_ XMXDECN4* pDestination, _In_ FXMVECTOR V); +void XMStoreXDec4(_Out_ XMXDEC4* pDestination, _In_ FXMVECTOR V); +void XMStoreDecN4(_Out_ XMDECN4* pDestination, _In_ FXMVECTOR V); +void XMStoreDec4(_Out_ XMDEC4* pDestination, _In_ FXMVECTOR V); +void XMStoreUDecN4(_Out_ XMUDECN4* pDestination, _In_ FXMVECTOR V); +void XMStoreUDec4(_Out_ XMUDEC4* pDestination, _In_ FXMVECTOR V); +void XMStoreByteN4(_Out_ XMBYTEN4* pDestination, _In_ FXMVECTOR V); +void XMStoreByte4(_Out_ XMBYTE4* pDestination, _In_ FXMVECTOR V); +void XMStoreUByteN4(_Out_ XMUBYTEN4* pDestination, _In_ FXMVECTOR V); +void XMStoreUByte4(_Out_ XMUBYTE4* pDestination, _In_ FXMVECTOR V); +void XMStoreUNibble4(_Out_ XMUNIBBLE4* pDestination, _In_ FXMVECTOR V); +void XMStoreU555(_Out_ XMU555* pDestination, _In_ FXMVECTOR V); + + +/**************************************************************************** + * + * Implementation + * + ****************************************************************************/ + +#pragma warning(push) +#pragma warning(disable:4068 4214 4204 4365 4616 6001) + +#pragma prefast(push) +#pragma prefast(disable : 25000, "FXMVECTOR is 16 bytes") + +#include "DirectXPackedVector.inl" + +#pragma prefast(pop) +#pragma warning(pop) + +}; // namespace PackedVector + +}; // namespace DirectX + + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.inl b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.inl new 
file mode 100644 index 00000000..b4ed1a77 --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXPackedVector.inl @@ -0,0 +1,3545 @@ +//------------------------------------------------------------------------------------- +// DirectXPackedVector.inl -- SIMD C++ Math library +// +// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF +// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A +// PARTICULAR PURPOSE. +// +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------------------------------- + +#ifdef _MSC_VER +#pragma once +#endif + + +/**************************************************************************** + * + * Data conversion + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline float PackedVector::XMConvertHalfToFloat +( + HALF Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + uint32_t Mantissa = (uint32_t)(Value & 0x03FF); + + uint32_t Exponent; + if ((Value & 0x7C00) != 0) // The value is normalized + { + Exponent = (uint32_t)((Value >> 10) & 0x1F); + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x0400) == 0); + + Mantissa &= 0x03FF; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + uint32_t Result = ((Value & 0x8000) << 16) | // Sign + ((Exponent + 112) << 23) | // Exponent + (Mantissa << 13); // Mantissa + + return reinterpret_cast<float*>(&Result)[0]; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ 
+inline float* PackedVector::XMConvertHalfToFloatStream +( + float* pOutputStream, + size_t OutputStride, + const HALF* pInputStream, + size_t InputStride, + size_t HalfCount +) +{ + assert(pOutputStream); + assert(pInputStream); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + + const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream); + uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream); + + for (size_t i = 0; i < HalfCount; i++) + { + *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]); + pHalf += InputStride; + pFloat += OutputStride; + } + + return pOutputStream; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ + +inline PackedVector::HALF PackedVector::XMConvertFloatToHalf +( + float Value +) +{ +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t Result; + + uint32_t IValue = reinterpret_cast<uint32_t *>(&Value)[0]; + uint32_t Sign = (IValue & 0x80000000U) >> 16U; + IValue = IValue & 0x7FFFFFFFU; // Hack off the sign + + if (IValue > 0x47FFEFFFU) + { + // The number is too large to be represented as a half. Saturate to infinity. + Result = 0x7FFFU; + } + else + { + if (IValue < 0x38800000U) + { + // The number is too small to be represented as a normalized half. + // Convert it to a denormalized value. + uint32_t Shift = 113U - (IValue >> 23U); + IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized half. 
+ IValue += 0xC8000000U; + } + + Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; + } + return (HALF)(Result|Sign); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream +( + HALF* pOutputStream, + size_t OutputStride, + const float* pInputStream, + size_t InputStride, + size_t FloatCount +) +{ + assert(pOutputStream); + assert(pInputStream); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) + + const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream); + uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream); + + for (size_t i = 0; i < FloatCount; i++) + { + *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]); + pFloat += InputStride; + pHalf += OutputStride; + } + return pOutputStream; + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * Vector and matrix load operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadColor +( + const XMCOLOR* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + // int32_t -> Float conversions are done in one instruction. + // uint32_t -> Float calls a runtime function. 
Keep in int32_t + int32_t iColor = (int32_t)(pSource->c); + XMVECTORF32 vColor = { + (float)((iColor >> 16) & 0xFF) * (1.0f/255.0f), + (float)((iColor >> 8) & 0xFF) * (1.0f/255.0f), + (float)(iColor & 0xFF) * (1.0f/255.0f), + (float)((iColor >> 24) & 0xFF) * (1.0f/255.0f) + }; + return vColor.v; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128i vInt = _mm_set1_epi32(pSource->c); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); + // a is unsigned! Flip the bit to convert the order to signed + vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); + // Convert to floating point numbers + XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); + // RGB + 0, A + 0x80000000.f to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); + // Convert 0-255 to 0.0f-1.0f + return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadHalf2 +( + const XMHALF2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadShortN2 +( + const XMSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? 
-1.f : ((float)pSource->y * (1.0f/32767.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Convert -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadShort2 +( + const XMSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // x needs to be sign extended + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x - 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); + // Y is 65536 too large + return _mm_mul_ps(vTemp,g_XMFixupY16); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUShortN2 +( + const XMUSHORTN2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x / 65535.0f, + (float)pSource->y / 65535.0f, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f}; + static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0}; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y + 0x8000 to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,FixaddY16); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,FixupY16); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUShort2 +( + const XMUSHORT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.f, + 0.f + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0}; + // Splat the two shorts in all four entries (WORD alignment okay, + // DWORD alignment preferred) + __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 + vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); + // y needs to be sign flipped + vTemp = _mm_xor_ps(vTemp,g_XMFlipY); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Y is 65536 times too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); + // y + 0x8000 to undo the signed order. + vTemp = _mm_add_ps(vTemp,FixaddY16); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadByteN2 +( + const XMBYTEN2* pSource +) +{ + assert(pSource); + XMVECTORF32 vResult = { + (pSource->x == -128) ? -1.f : ((float)pSource->x * (1.0f/127.0f)), + (pSource->y == -128) ? 
-1.f : ((float)pSource->y * (1.0f/127.0f)), + 0.0f, + 0.0f + }; + return vResult.v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadByte2 +( + const XMBYTE2* pSource +) +{ + assert(pSource); + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.0f, + 0.0f + }; + return vResult.v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUByteN2 +( + const XMUBYTEN2* pSource +) +{ + assert(pSource); + XMVECTORF32 vResult = { + (float)pSource->x * (1.0f/255.0f), + (float)pSource->y * (1.0f/255.0f), + 0.0f, + 0.0f + }; + return vResult.v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUByte2 +( + const XMUBYTE2* pSource +) +{ + assert(pSource); + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + 0.0f, + 0.0f + }; + return vResult.v; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadU565 +( + const XMU565* pSource +) +{ + assert(pSource); +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; + static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,U565And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U565Mul); + return vResult; +#else + XMVECTORF32 vResult = { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x3F), + float((pSource->v >> 11) & 0x1F), + 0.f, + }; + return vResult.v; +#endif // 
!_XM_SSE_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadFloat3PK +( + const XMFLOAT3PK* pSource +) +{ + assert(pSource); + + __declspec(align(16)) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent; + + // X Channel (6-bit mantissa) + Mantissa = pSource->xm; + + if ( pSource->xe == 0x1f ) // INF or NAN + { + Result[0] = 0x7f800000 | (pSource->xm << 17); + } + else + { + if ( pSource->xe != 0 ) // The value is normalized + { + Exponent = pSource->xe; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Y Channel (6-bit mantissa) + Mantissa = pSource->ym; + + if ( pSource->ye == 0x1f ) // INF or NAN + { + Result[1] = 0x7f800000 | (pSource->ym << 17); + } + else + { + if ( pSource->ye != 0 ) // The value is normalized + { + Exponent = pSource->ye; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); + } + + // Z Channel (5-bit mantissa) + Mantissa = pSource->zm; + + if ( pSource->ze == 0x1f ) // INF or NAN + { + Result[2] = 0x7f800000 | (pSource->zm << 17); + } + else + { + if ( pSource->ze != 0 ) // The value is normalized + { + Exponent = pSource->ze; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while 
((Mantissa & 0x20) == 0); + + Mantissa &= 0x1F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); + } + + return XMLoadFloat3A( reinterpret_cast<const XMFLOAT3A*>(&Result) ); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadFloat3SE +( + const XMFLOAT3SE* pSource +) +{ + assert(pSource); + + __declspec(align(16)) uint32_t Result[4]; + uint32_t Mantissa; + uint32_t Exponent, ExpBits; + + if ( pSource->e == 0x1f ) // INF or NAN + { + Result[0] = 0x7f800000 | (pSource->xm << 14); + Result[1] = 0x7f800000 | (pSource->ym << 14); + Result[2] = 0x7f800000 | (pSource->zm << 14); + } + else if ( pSource->e != 0 ) // The values are all normalized + { + Exponent = pSource->e; + + ExpBits = (Exponent + 112) << 23; + + Mantissa = pSource->xm; + Result[0] = ExpBits | (Mantissa << 14); + + Mantissa = pSource->ym; + Result[1] = ExpBits | (Mantissa << 14); + + Mantissa = pSource->zm; + Result[2] = ExpBits | (Mantissa << 14); + } + else + { + // X Channel + Mantissa = pSource->xm; + + if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x200) == 0); + + Mantissa &= 0x1FF; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14); + + // Y Channel + Mantissa = pSource->ym; + + if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x200) == 0); + + Mantissa &= 0x1FF; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14); + + // Z Channel + Mantissa = pSource->zm; + + if (Mantissa != 0) // The value is 
denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x200) == 0); + + Mantissa &= 0x1FF; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14); + } + + return XMLoadFloat3A( reinterpret_cast<const XMFLOAT3A*>(&Result) ); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadHalf4 +( + const XMHALF4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + XMConvertHalfToFloat(pSource->x), + XMConvertHalfToFloat(pSource->y), + XMConvertHalfToFloat(pSource->z), + XMConvertHalfToFloat(pSource->w) + }; + return vResult.v; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadShortN4 +( + const XMSHORTN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -32768) ? -1.f : ((float)pSource->x * (1.0f/32767.0f)), + (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), + (pSource->z == -32768) ? -1.f : ((float)pSource->z * (1.0f/32767.0f)), + (pSource->w == -32768) ? 
-1.f : ((float)pSource->w * (1.0f/32767.0f)) + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vInt = vld1_s16( (const int16_t*)pSource ); + __n128 V = vmovl_s16( vInt ); + V = vcvtq_f32_s32( V ); + const __n128 Scale = vdupq_n_f32( 1.0f/32767.0f ); + V = vmulq_f32( V, Scale ); + return vmaxq_f32( V, g_XMNegativeOne ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); + // Convert to -1.0f - 1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); + // Clamp result (for case of -32768) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadShort4 +( + const XMSHORT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + (float)pSource->z, + (float)pSource->w + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vInt = vld1_s16( (const int16_t*)pSource ); + __n128 V = vmovl_s16( vInt ); + return vcvtq_f32_s32( V ); +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // x and z are unsigned! Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x and z - 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUShortN4 +( + const XMUSHORTN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x / 65535.0f, + (float)pSource->y / 65535.0f, + (float)pSource->z / 65535.0f, + (float)pSource->w / 65535.0f + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vInt = vld1_u16( (const uint16_t*)pSource ); + __n128 V = vmovl_u16( vInt ); + V = vcvtq_f32_u32( V ); + const __n128 Scale = vdupq_n_f32( 1.0f/65535.0f ); + return vmulq_f32( V, Scale ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)}; + static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f}; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,FixaddY16W16); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,FixupY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUShort4 +( + const XMUSHORT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + (float)pSource->z, + (float)pSource->w + }; + return vResult.v; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 vInt = vld1_u16( (const uint16_t*)pSource ); + __n128 V = vmovl_u16( vInt ); + return vcvtq_f32_u32( V ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f}; + // Splat the color in all four entries (x,z,y,w) + __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); + // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 + __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); + // y and w are signed! Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // Fix y and w because they are 65536 too large + vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); + // y and w + 0x8000 to complete the conversion + vTemp = _mm_add_ps(vTemp,FixaddY16W16); + // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w + return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadXDecN4 +( + const XMXDECN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f), + (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f), + (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f), + (float)(pSource->v >> 30) / 3.0f + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10); + // Clamp result (for case of -512) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadXDec4 +( + const XMXDEC4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]), + (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]), + (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]), + (float)(pSource->v >> 30) + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000}; + static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f}; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,XDec4Xor); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,XDec4Add); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUDecN4 +( + const XMUDECN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + (float)ElementX / 1023.0f, + (float)ElementY / 1023.0f, + (float)ElementZ / 1023.0f, + (float)(pSource->v >> 30) / 3.0f + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)}; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,UDecN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUDec4 +( + const XMUDEC4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + + XMVECTORF32 vResult = { + (float)ElementX, + (float)ElementY, + (float)ElementZ, + (float)(pSource->v >> 30) + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadDecN4 +( + const XMDECN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { + (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f), + (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f), + (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f), + (ElementW == 0x2) ? -1.f : ((float)(int16_t)(ElementW | SignExtendW[(ElementW >> 1) & 1])) + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)}; + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,DecN4Mul); + // Clamp result (for case of -512/-1) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadDec4 +( + const XMDEC4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; + static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC}; + + uint32_t ElementX = pSource->v & 0x3FF; + uint32_t ElementY = (pSource->v >> 10) & 0x3FF; + uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; + uint32_t ElementW = pSource->v >> 30; + + XMVECTORF32 vResult = { + (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]), + (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]), + (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]), + (float)(int16_t)(ElementW | SignExtendW[ElementW >> 1]) + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + // Splat the color in all four entries + XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); + // a is unsigned! Flip the bit to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorDec4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // RGB + 0, A + 0x80000000.f to undo the signed order. 
+ vTemp = _mm_add_ps(vTemp,g_XMAddDec4); + // Convert 0-255 to 0.0f-1.0f + vTemp = _mm_mul_ps(vTemp,g_XMMulDec4); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUByteN4 +( + const XMUBYTEN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x / 255.0f, + (float)pSource->y / 255.0f, + (float)pSource->z / 255.0f, + (float)pSource->w / 255.0f + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUByte4 +( + const XMUBYTE4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + (float)pSource->z, + (float)pSource->w + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // w is signed! 
Flip the bits to convert the order to unsigned + vTemp = _mm_xor_ps(vTemp,g_XMFlipW); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // w + 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddUDec4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadByteN4 +( + const XMBYTEN4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (pSource->x == -128) ? -1.f : ((float)pSource->x / 127.0f), + (pSource->y == -128) ? -1.f : ((float)pSource->y / 127.0f), + (pSource->z == -128) ? -1.f : ((float)pSource->z / 127.0f), + (pSource->w == -128) ? -1.f : ((float)pSource->w / 127.0f) + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul); + // Clamp result (for case of -128) + return _mm_max_ps( vTemp, g_XMNegativeOne ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadByte4 +( + const XMBYTE4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + XMVECTORF32 vResult = { + (float)pSource->x, + (float)pSource->y, + (float)pSource->z, + (float)pSource->w + }; + return vResult.v; +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)}; + // Splat the color in all four entries (x,z,y,w) + XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x)); + // Mask x&0ff,y&0xff00,z&0xff0000,w&0xff000000 + vTemp = _mm_and_ps(vTemp,g_XMMaskByte4); + // x,y and z are unsigned! 
Flip the bits to convert the order to signed + vTemp = _mm_xor_ps(vTemp,g_XMXorByte4); + // Convert to floating point numbers + vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); + // x, y and z - 0x80 to complete the conversion + vTemp = _mm_add_ps(vTemp,g_XMAddByte4); + // Fix y, z and w because they are too large + vTemp = _mm_mul_ps(vTemp,LoadByte4Mul); + return vTemp; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadUNibble4 +( + const XMUNIBBLE4* pSource +) +{ + assert(pSource); +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000}; + static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f}; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); + // Mask off x, y and z + vResult = _mm_and_ps(vResult,UNibble4And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,UNibble4Mul); + return vResult; +#else + XMVECTORF32 vResult = { + float(pSource->v & 0xF), + float((pSource->v >> 4) & 0xF), + float((pSource->v >> 8) & 0xF), + float((pSource->v >> 12) & 0xF) + }; + return vResult.v; +#endif // !_XM_SSE_INTRISICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR PackedVector::XMLoadU555 +( + const XMU555* pSource +) +{ + assert(pSource); +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000}; + static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f}; + // Get the 32 bit value and splat it + XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); 
+ // Mask off x, y and z + vResult = _mm_and_ps(vResult,U555And); + // Convert to float + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Normalize x, y, and z + vResult = _mm_mul_ps(vResult,U555Mul); + return vResult; +#else + XMVECTORF32 vResult = { + float(pSource->v & 0x1F), + float((pSource->v >> 5) & 0x1F), + float((pSource->v >> 10) & 0x1F), + float((pSource->v >> 15) & 0x1) + }; + return vResult.v; +#endif // !_XM_SSE_INTRISICS_ +} + + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void PackedVector::XMStoreColor +( + XMCOLOR* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->c = ((uint32_t)tmp.w << 24) | + ((uint32_t)tmp.x << 16) | + ((uint32_t)tmp.y << 8) | + ((uint32_t)tmp.z); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {255.0f,255.0f,255.0f,255.0f}; + // Set <0 to 0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Set>1 to 1 + vResult = _mm_min_ps(vResult,g_XMOne); + // Convert to 0-255 + vResult = _mm_mul_ps(vResult,Scale); + // Shuffle RGBA to ARGB + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); + // Convert to int + __m128i vInt = _mm_cvtps_epi32(vResult); + // Mash to shorts + vInt = _mm_packs_epi32(vInt,vInt); + // Mash to bytes + vInt = _mm_packus_epi16(vInt,vInt); + // Store the color + _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreHalf2 +( + XMHALF2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V)); + pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V)); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreShortN2 +( + XMSHORTN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti,vResulti); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreShort2 +( + XMSHORT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, 
-32767.0f}; + static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; + static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt,vInt); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vInt)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUShortN2 +( + XMUSHORTN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = 
static_cast<int16_t>(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUShort2 +( + XMUSHORT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast<int16_t>(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreByteN2 +( + XMBYTEN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + static const XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (int8_t)tmp.x; + pDestination->y = (int8_t)tmp.y; +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreByte2 +( + XMBYTE2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + static const XMVECTORF32 Min = {-127.0f, -127.0f, -127.0f, -127.0f}; + static const XMVECTORF32 Max = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (int8_t)tmp.x; + pDestination->y = (int8_t)tmp.y; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUByteN2 +( + XMUBYTEN2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + static const XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (uint8_t)tmp.x; + pDestination->y = (uint8_t)tmp.y; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUByte2 +( + XMUBYTE2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + static const XMVECTORF32 Max = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->x = (uint8_t)tmp.x; + pDestination->y = (uint8_t)tmp.y; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreU565 +( + XMU565* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + 
vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0)); + uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2)); + uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4)); + pDestination->v = ((z & 0x1F) << 11) | + ((y & 0x3F) << 5) | + ((x & 0x1F)); +#else + static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A( &tmp, N ); + + pDestination->v = (((uint16_t)tmp.z & 0x1F) << 11) | + (((uint16_t)tmp.y & 0x3F) << 5) | + (((uint16_t)tmp.x & 0x1F)); +#endif !_XM_SSE_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreFloat3PK +( + XMFLOAT3PK* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + __declspec(align(16)) uint32_t IValue[4]; + XMStoreFloat3A( reinterpret_cast<XMFLOAT3A*>(&IValue), V ); + + uint32_t Result[3]; + + // X & Y Channels (5-bit exponent, 6-bit mantissa) + for(uint32_t j=0; j < 2; ++j) + { + uint32_t Sign = IValue[j] & 0x80000000; + uint32_t I = IValue[j] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[j] = 0x7c0; + if (( I & 0x7FFFFF ) != 0) + { + Result[j] = 0x7c0 | (((I>>17)|(I>11)|(I>>6)|(I))&0x3f); + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3PK is positive only + Result[j] = 0; + } + } + else if ( Sign ) + { + // 3PK is positive only, so clamp to zero + Result[j] = 0; + } + else if (I > 0x477E0000U) + { + // The number is too large to be represented as a float11, set to max + Result[j] = 0x7BF; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float11 + // Convert it to a denormalized 
value. + uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float11 + I += 0xC8000000U; + } + + Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU; + } + } + + // Z Channel (5-bit exponent, 5-bit mantissa) + uint32_t Sign = IValue[2] & 0x80000000; + uint32_t I = IValue[2] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Result[2] = 0x3e0; + if ( I & 0x7FFFFF ) + { + Result[2] = 0x3e0 | (((I>>18)|(I>13)|(I>>3)|(I))&0x1f); + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3PK is positive only + Result[2] = 0; + } + } + else if ( Sign ) + { + // 3PK is positive only, so clamp to zero + Result[2] = 0; + } + else if (I > 0x477C0000U) + { + // The number is too large to be represented as a float10, set to max + Result[2] = 0x3df; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float10 + // Convert it to a denormalized value. 
+ uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float10 + I += 0xC8000000U; + } + + Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU; + } + + // Pack Result into memory + pDestination->v = (Result[0] & 0x7ff) + | ( (Result[1] & 0x7ff) << 11 ) + | ( (Result[2] & 0x3ff) << 22 ); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreFloat3SE +( + XMFLOAT3SE* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + + __declspec(align(16)) uint32_t IValue[4]; + XMStoreFloat3A( reinterpret_cast<XMFLOAT3A*>(&IValue), V ); + + uint32_t Exp[3]; + uint32_t Frac[3]; + + // X, Y, Z Channels (5-bit exponent, 9-bit mantissa) + for(uint32_t j=0; j < 3; ++j) + { + uint32_t Sign = IValue[j] & 0x80000000; + uint32_t I = IValue[j] & 0x7FFFFFFF; + + if ((I & 0x7F800000) == 0x7F800000) + { + // INF or NAN + Exp[j] = 0x1f; + if (( I & 0x7FFFFF ) != 0) + { + Frac[j] = ((I>>14)|(I>5)|(I))&0x1ff; + } + else if ( Sign ) + { + // -INF is clamped to 0 since 3SE is positive only + Exp[j] = Frac[j] = 0; + } + } + else if ( Sign ) + { + // 3SE is positive only, so clamp to zero + Exp[j] = Frac[j] = 0; + } + else if (I > 0x477FC000U) + { + // The number is too large, set to max + Exp[j] = 0x1e; + Frac[j] = 0x1ff; + } + else + { + if (I < 0x38800000U) + { + // The number is too small to be represented as a normalized float11 + // Convert it to a denormalized value. 
+ uint32_t Shift = 113U - (I >> 23U); + I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; + } + else + { + // Rebias the exponent to represent the value as a normalized float11 + I += 0xC8000000U; + } + + uint32_t T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU; + + Exp[j] = (T & 0x3E00) >> 9; + Frac[j] = T & 0x1ff; + } + } + + // Adjust to a shared exponent + uint32_t T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) ); + + Frac[0] = Frac[0] >> (T - Exp[0]); + Frac[1] = Frac[1] >> (T - Exp[1]); + Frac[2] = Frac[2] >> (T - Exp[2]); + + // Store packed into memory + pDestination->xm = Frac[0]; + pDestination->ym = Frac[1]; + pDestination->zm = Frac[2]; + pDestination->e = T; +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreHalf4 +( + XMHALF4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + XMFLOAT4A t; + XMStoreFloat4A(&t, V ); + + pDestination->x = XMConvertFloatToHalf(t.x); + pDestination->y = XMConvertFloatToHalf(t.y); + pDestination->z = XMConvertFloatToHalf(t.z); + pDestination->w = XMConvertFloatToHalf(t.w); + +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreShortN4 +( + XMSHORTN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + pDestination->z = (int16_t)tmp.z; + pDestination->w = (int16_t)tmp.w; + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vmaxq_f32( V, g_XMNegativeOne ); + vResult = vminq_f32( vResult, g_XMOne ); + const __n128 Scale = vdupq_n_f32( 32767.0f ); + vResult = vmulq_f32( vResult, Scale ); + vResult = vcvtq_s32_f32( vResult ); + __n64 vInt = vmovn_s32( vResult ); + vst1_s16( (int16_t*)pDestination, vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + __m128i vResulti = _mm_cvtps_epi32(vResult); + vResulti = _mm_packs_epi32(vResulti,vResulti); + _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreShort4 +( + XMSHORT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; + static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + pDestination->z = (int16_t)tmp.z; + pDestination->w = (int16_t)tmp.w; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; + static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + + __n128 vResult = vmaxq_f32( V, Min ); + vResult = vminq_f32( vResult, Max ); + vResult = vcvtq_s32_f32( vResult ); + __n64 vInt = vmovn_s32( vResult ); + vst1_s16( (int16_t*)pDestination, vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Min = {-32767.0f, 
-32767.0f, -32767.0f, -32767.0f}; + static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Pack the ints into shorts + vInt = _mm_packs_epi32(vInt,vInt); + _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vInt)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUShortN4 +( + XMUSHORTN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); + N = XMVectorTruncate(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + pDestination->z = (int16_t)tmp.z; + pDestination->w = (int16_t)tmp.w; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 vResult = vmaxq_f32( V, g_XMZero ); + vResult = vminq_f32( vResult, g_XMOne ); + const __n128 Scale = vdupq_n_f32( 65535.0f ); + vResult = vmulq_f32( vResult, Scale ); + vResult = vcvtq_u32_f32( vResult ); + __n64 vInt = vmovn_u32( vResult ); + vst1_u16( (uint16_t*)pDestination, vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = 
static_cast<int16_t>(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2)); + pDestination->z = static_cast<int16_t>(_mm_extract_epi16(vInt,4)); + pDestination->w = static_cast<int16_t>(_mm_extract_epi16(vInt,6)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUShort4 +( + XMUSHORT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + + static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (int16_t)tmp.x; + pDestination->y = (int16_t)tmp.y; + pDestination->z = (int16_t)tmp.z; + pDestination->w = (int16_t)tmp.w; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + + __n128 vResult = vmaxq_f32( V, g_XMZero ); + vResult = vminq_f32( vResult, Max ); + vResult = vcvtq_u32_f32( vResult ); + __n64 vInt = vmovn_u32( vResult ); + vst1_u16( (uint16_t*)pDestination, vInt ); +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // Since the SSE pack instruction clamps using signed rules, + // manually extract the values to store them to memory + pDestination->x = static_cast<int16_t>(_mm_extract_epi16(vInt,0)); + pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2)); + pDestination->z = static_cast<int16_t>(_mm_extract_epi16(vInt,4)); + pDestination->w = static_cast<int16_t>(_mm_extract_epi16(vInt,6)); +#else // _XM_VMX128_INTRINSICS_ +#endif // 
_XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreXDecN4 +( + XMXDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) << 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f}; + static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f}; + static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29}; + XMVECTOR vResult = _mm_max_ps(V,Min); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,Scale); + // Convert to int (W is unsigned) + __m128i vResulti = _mm_cvtps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,ScaleMask); + // To fix W, add itself to shift it up to <<30 instead of <<29 + __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); + vResulti = _mm_add_epi32(vResulti,vResultw); + // Do a horizontal or of all 4 entries + vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); + vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); + _mm_store_ss(reinterpret_cast<float 
*>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreXDec4 +( + XMXDEC4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, 0.0f}; + static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 3.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) << 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; + static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; + static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinXDec4); + vResult = _mm_min_ps(vResult,MaxXDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleXDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskXDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 
+#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUDecN4 +( + XMUDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((uint32_t)tmp.z & 0x3FF) << 20) | + (((uint32_t)tmp.y & 0x3FF) << 10) | + (((uint32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; + static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void 
PackedVector::XMStoreUDec4 +( + XMUDEC4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Max = {1023.0f, 1023.0f, 1023.0f, 3.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((uint32_t)tmp.w << 30) | + (((uint32_t)tmp.z & 0x3FF) << 20) | + (((uint32_t)tmp.y & 0x3FF) << 10) | + (((uint32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f}; + static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; + static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a left shift by one bit on y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreDecN4 +( + XMDECN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 
+ + static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f}; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(N, Scale.v); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((int32_t)tmp.w << 30) | + (((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) << 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f}; + static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleDecN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskDecN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreDec4 +( + XMDEC4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, -1.0f}; + static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 1.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((int32_t)tmp.w << 30) | + 
(((int32_t)tmp.z & 0x3FF) << 20) | + (((int32_t)tmp.y & 0x3FF) << 10) | + (((int32_t)tmp.x & 0x3FF)); + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f}; + static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f}; + static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f}; + static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinDec4); + vResult = _mm_min_ps(vResult,MaxDec4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleDec4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskDec4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUByteN4 +( + XMUBYTEN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMVECTOR N = XMVectorSaturate(V); + N = XMVectorMultiply(N, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (uint8_t)tmp.x; + pDestination->y = (uint8_t)tmp.y; + pDestination->z = (uint8_t)tmp.z; + pDestination->w = (uint8_t)tmp.w; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleUByteN4 = 
{255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f}; + static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUByte4 +( + XMUBYTE4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Max = {255.0f, 255.0f, 255.0f, 255.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (uint8_t)tmp.x; + pDestination->y = (uint8_t)tmp.y; + pDestination->z = (uint8_t)tmp.z; + pDestination->w = (uint8_t)tmp.w; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f}; + static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f}; + static const XMVECTORI32 MaskUByte4 = 
{0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,MaxUByte4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleUByte4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskUByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // Perform a single bit left shift to fix y|w + vResulti2 = _mm_add_epi32(vResulti2,vResulti2); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreByteN4 +( + XMBYTEN4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); + N = XMVectorMultiply(V, Scale.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (int8_t)tmp.x; + pDestination->y = (int8_t)tmp.y; + pDestination->z = (int8_t)tmp.z; + pDestination->w = (int8_t)tmp.w; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f}; + static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); + vResult = _mm_min_ps(vResult,g_XMOne); + // Scale by 
multiplication + vResult = _mm_mul_ps(vResult,ScaleByteN4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any fraction + vResulti = _mm_and_si128(vResulti,MaskByteN4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreByte4 +( + XMBYTE4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 Min = {-127.0f, -127.0f, -127.0f, -127.0f}; + static const XMVECTORF32 Max = {127.0f, 127.0f, 127.0f, 127.0f}; + + XMVECTOR N = XMVectorClamp(V, Min, Max); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->x = (int8_t)tmp.x; + pDestination->y = (int8_t)tmp.y; + pDestination->z = (int8_t)tmp.z; + pDestination->w = (int8_t)tmp.w; + +#elif defined(_XM_SSE_INTRINSICS_) + static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f}; + static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f}; + static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f}; + static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; + // Clamp to bounds + XMVECTOR vResult = _mm_max_ps(V,MinByte4); + vResult = _mm_min_ps(vResult,MaxByte4); + // Scale by multiplication + vResult = _mm_mul_ps(vResult,ScaleByte4); + // Convert to int + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Mask off any 
fraction + vResulti = _mm_and_si128(vResulti,MaskByte4); + // Do a horizontal or of 4 entries + __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); + // x = x|z, y = y|w + vResulti = _mm_or_si128(vResulti,vResulti2); + // Move Z to the x position + vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); + // i = x|y|z|w + vResulti = _mm_or_si128(vResulti,vResulti2); + _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void PackedVector::XMStoreUNibble4 +( + XMUNIBBLE4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0)); + uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2)); + uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4)); + uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6)); + pDestination->v = ((w & 0xF) << 12) | + ((z & 0xF) << 8) | + ((y & 0xF) << 4) | + ((x & 0xF)); +#else + static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = (((uint16_t)tmp.w & 0xF) << 12) | + (((uint16_t)tmp.z & 0xF) << 8) | + (((uint16_t)tmp.y & 0xF) << 4) | + (((uint16_t)tmp.x & 0xF)); +#endif !_XM_SSE_INTRINSICS_ +} + +//------------------------------------------------------------------------------ 
+_Use_decl_annotations_ +inline void PackedVector::XMStoreU555 +( + XMU555* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) + static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; + // Bounds check + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + vResult = _mm_min_ps(vResult,Max); + // Convert to int with rounding + __m128i vInt = _mm_cvtps_epi32(vResult); + // No SSE operations will write to 16-bit values, so we have to extract them manually + uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0)); + uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2)); + uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4)); + uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6)); + pDestination->v = ((w) ? 0x8000 : 0) | + ((z & 0x1F) << 10) | + ((y & 0x1F) << 5) | + ((x & 0x1F)); +#else + static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; + + XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); + N = XMVectorRound(N); + + XMFLOAT4A tmp; + XMStoreFloat4A(&tmp, N ); + + pDestination->v = ((tmp.w > 0.f) ? 
0x8000 : 0) | + (((uint16_t)tmp.z & 0x1F) << 10) | + (((uint16_t)tmp.y & 0x1F) << 5) | + (((uint16_t)tmp.x & 0x1F)); +#endif !_XM_SSE_INTRINSICS_ +} + + +/**************************************************************************** + * + * XMCOLOR operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMCOLOR::XMCOLOR +( + float _r, + float _g, + float _b, + float _a +) +{ + XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMCOLOR::XMCOLOR +( + const float* pArray +) +{ + XMStoreColor(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMHALF2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMHALF2::XMHALF2 +( + float _x, + float _y +) +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMHALF2::XMHALF2 +( + const float* pArray +) +{ + assert( pArray != nullptr ); + x = XMConvertFloatToHalf(pArray[0]); + y = XMConvertFloatToHalf(pArray[1]); +} + +/**************************************************************************** + * + * XMSHORTN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMSHORTN2::XMSHORTN2 +( + float _x, + float _y +) +{ + XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMSHORTN2::XMSHORTN2 +( + const float* pArray +) +{ + XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMSHORT2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMSHORT2::XMSHORT2 +( + float _x, + float _y +) +{ + XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMSHORT2::XMSHORT2 +( + const float* pArray +) +{ + XMStoreShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUSHORTN2::XMUSHORTN2 +( + float _x, + float _y +) +{ + XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUSHORTN2::XMUSHORTN2 +( + const float* pArray +) +{ + XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUSHORT2::XMUSHORT2 +( + float _x, + float _y +) +{ + 
XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUSHORT2::XMUSHORT2 +( + const float* pArray +) +{ + XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMBYTEN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMBYTEN2::XMBYTEN2 +( + float _x, + float _y +) +{ + XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMBYTEN2::XMBYTEN2 +( + const float* pArray +) +{ + XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMBYTE2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMBYTE2::XMBYTE2 +( + float _x, + float _y +) +{ + XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMBYTE2::XMBYTE2 +( + const float* pArray +) +{ + XMStoreByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUBYTEN2::XMUBYTEN2 +( + float _x, 
+ float _y +) +{ + XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUBYTEN2::XMUBYTEN2 +( + const float* pArray +) +{ + XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE2 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUBYTE2::XMUBYTE2 +( + float _x, + float _y +) +{ + XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUBYTE2::XMUBYTE2 +( + const float* pArray +) +{ + XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); +} + +/**************************************************************************** + * + * XMU565 operators + * + ****************************************************************************/ + +inline PackedVector::XMU565::XMU565 +( + float _x, + float _y, + float _z +) +{ + XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +_Use_decl_annotations_ +inline PackedVector::XMU565::XMU565 +( + const float *pArray +) +{ + XMStoreU565(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3PK operators + * + ****************************************************************************/ + +inline PackedVector::XMFLOAT3PK::XMFLOAT3PK +( + float _x, + float _y, + float _z +) +{ + XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +_Use_decl_annotations_ +inline PackedVector::XMFLOAT3PK::XMFLOAT3PK +( + const float *pArray +) +{ + XMStoreFloat3PK(this, 
XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray))); +} + +/**************************************************************************** + * + * XMFLOAT3SE operators + * + ****************************************************************************/ + +inline PackedVector::XMFLOAT3SE::XMFLOAT3SE +( + float _x, + float _y, + float _z +) +{ + XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f )); +} + +_Use_decl_annotations_ +inline PackedVector::XMFLOAT3SE::XMFLOAT3SE +( + const float *pArray +) +{ + XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray))); +} + +/**************************************************************************** + * + * XMHALF4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMHALF4::XMHALF4 +( + float _x, + float _y, + float _z, + float _w +) +{ + x = XMConvertFloatToHalf(_x); + y = XMConvertFloatToHalf(_y); + z = XMConvertFloatToHalf(_z); + w = XMConvertFloatToHalf(_w); +} + +//------------------------------------------------------------------------------ + +_Use_decl_annotations_ +inline PackedVector::XMHALF4::XMHALF4 +( + const float* pArray +) +{ + XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4); +} + +/**************************************************************************** + * + * XMSHORTN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMSHORTN4::XMSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMSHORTN4::XMSHORTN4 +( + const float* pArray +) +{ + XMStoreShortN4(this, 
XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMSHORT4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMSHORT4::XMSHORT4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMSHORT4::XMSHORT4 +( + const float* pArray +) +{ + XMStoreShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMUSHORTN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUSHORTN4::XMUSHORTN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUSHORTN4::XMUSHORTN4 +( + const float* pArray +) +{ + XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMUSHORT4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUSHORT4::XMUSHORT4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ 
+_Use_decl_annotations_ +inline PackedVector::XMUSHORT4::XMUSHORT4 +( + const float* pArray +) +{ + XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMXDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMXDECN4::XMXDECN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMXDECN4::XMXDECN4 +( + const float* pArray +) +{ + XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMXDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMXDEC4::XMXDEC4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMXDEC4::XMXDEC4 +( + const float* pArray +) +{ + XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMDECN4::XMDECN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + 
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMDECN4::XMDECN4 +( + const float* pArray +) +{ + XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMDEC4::XMDEC4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMDEC4::XMDEC4 +( + const float* pArray +) +{ + XMStoreDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMUDECN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUDECN4::XMUDECN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUDECN4::XMUDECN4 +( + const float* pArray +) +{ + XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMUDEC4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUDEC4::XMUDEC4 +( + float _x, + float _y, + float _z, + float _w +) 
+{ + XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUDEC4::XMUDEC4 +( + const float* pArray +) +{ + XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMBYTEN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMBYTEN4::XMBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMBYTEN4::XMBYTEN4 +( + const float* pArray +) +{ + XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMBYTE4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMBYTE4::XMBYTE4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMBYTE4::XMBYTE4 +( + const float* pArray +) +{ + XMStoreByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMUBYTEN4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline 
PackedVector::XMUBYTEN4::XMUBYTEN4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUBYTEN4::XMUBYTEN4 +( + const float* pArray +) +{ + XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMUBYTE4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUBYTE4::XMUBYTE4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w)); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUBYTE4::XMUBYTE4 +( + const float* pArray +) +{ + XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMUNIBBLE4 operators + * + ****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 +( + float _x, + float _y, + float _z, + float _w +) +{ + XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w )); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 +( + const float *pArray +) +{ + XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); +} + +/**************************************************************************** + * + * XMU555 operators + * + 
****************************************************************************/ + +//------------------------------------------------------------------------------ + +inline PackedVector::XMU555::XMU555 +( + float _x, + float _y, + float _z, + bool _w +) +{ + XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) )); +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline PackedVector::XMU555::XMU555 +( + const float *pArray, + bool _w +) +{ + XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)); + XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) )); +} + + diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/no_sal2.h b/Minecraft.Client/PS3/PS3Extras/DirectX/no_sal2.h new file mode 100644 index 00000000..b66b68cd --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/no_sal2.h @@ -0,0 +1,1022 @@ + +/*** +* no_sal2.h - renders the SAL annotations for documenting APIs harmless. +* +* Copyright (c) Microsoft Corporation. All rights reserved. +* +*Purpose: +* sal.h provides a set of SAL2 annotations to describe how a function uses its +* parameters - the assumptions it makes about them, and the guarantees it makes +* upon finishing. This file redefines all those annotation macros to be harmless. +* It is designed for use in down-level build environments where the tooling may +* be unhappy with the standard SAL2 macro definitions. 
+* +* [Public] +* +****/ + +#ifndef _NO_SAL_2_H_ +#define _NO_SAL_2_H_ + +#ifdef _When_ +#undef _When_ +#endif +#define _When_(c,a) +#ifdef _At_ +#undef _At_ +#endif +#define _At_(t,a) +#ifdef _At_buffer_ +#undef _At_buffer_ +#endif +#define _At_buffer_(t,i,c,a) +#ifdef _Group_ +#undef _Group_ +#endif +#define _Group_(a) +#ifdef _Pre_ +#undef _Pre_ +#endif +#define _Pre_ +#ifdef _Post_ +#undef _Post_ +#endif +#define _Post_ +#ifdef _Deref_ +#undef _Deref_ +#endif +#define _Deref_ +#ifdef _Null_ +#undef _Null_ +#endif +#define _Null_ +#ifdef _Notnull_ +#undef _Notnull_ +#endif +#define _Notnull_ +#ifdef _Maybenull_ +#undef _Maybenull_ +#endif +#define _Maybenull_ +#ifdef _Const_ +#undef _Const_ +#endif +#define _Const_ +#ifdef _Check_return_ +#undef _Check_return_ +#endif +#define _Check_return_ +#ifdef _Must_inspect_result_ +#undef _Must_inspect_result_ +#endif +#define _Must_inspect_result_ +#ifdef _Pre_satisfies_ +#undef _Pre_satisfies_ +#endif +#define _Pre_satisfies_(e) +#ifdef _Post_satisfies_ +#undef _Post_satisfies_ +#endif +#define _Post_satisfies_(e) +#ifdef _Writable_elements_ +#undef _Writable_elements_ +#endif +#define _Writable_elements_(s) +#ifdef _Writable_bytes_ +#undef _Writable_bytes_ +#endif +#define _Writable_bytes_(s) +#ifdef _Readable_elements_ +#undef _Readable_elements_ +#endif +#define _Readable_elements_(s) +#ifdef _Readable_bytes_ +#undef _Readable_bytes_ +#endif +#define _Readable_bytes_(s) +#ifdef _Null_terminated_ +#undef _Null_terminated_ +#endif +#define _Null_terminated_ +#ifdef _NullNull_terminated_ +#undef _NullNull_terminated_ +#endif +#define _NullNull_terminated_ +#ifdef _Valid_ +#undef _Valid_ +#endif +#define _Valid_ +#ifdef _Notvalid_ +#undef _Notvalid_ +#endif +#define _Notvalid_ +#ifdef _Success_ +#undef _Success_ +#endif +#define _Success_(c) +#ifdef _Return_type_success_ +#undef _Return_type_success_ +#endif +#define _Return_type_success_(c) +#ifdef _On_failure_ +#undef _On_failure_ +#endif +#define _On_failure_(a) 
+#ifdef _Always_ +#undef _Always_ +#endif +#define _Always_(a) +#ifdef _Use_decl_annotations_ +#undef _Use_decl_annotations_ +#endif +#define _Use_decl_annotations_ +#ifdef _Pre_defensive_ +#undef _Pre_defensive_ +#endif +#define _Pre_defensive_ +#ifdef _Post_defensive_ +#undef _Post_defensive_ +#endif +#define _Post_defensive_ +#ifdef _Pre_unknown_ +#undef _Pre_unknown_ +#endif +#define _Pre_unknown_ +#ifdef _Acquires_lock_ +#undef _Acquires_lock_ +#endif +#define _Acquires_lock_(e) +#ifdef _Releases_lock_ +#undef _Releases_lock_ +#endif +#define _Releases_lock_(e) +#ifdef _Requires_lock_held_ +#undef _Requires_lock_held_ +#endif +#define _Requires_lock_held_(e) +#ifdef _Requires_lock_not_held_ +#undef _Requires_lock_not_held_ +#endif +#define _Requires_lock_not_held_(e) +#ifdef _Requires_no_locks_held_ +#undef _Requires_no_locks_held_ +#endif +#define _Requires_no_locks_held_ +#ifdef _Guarded_by_ +#undef _Guarded_by_ +#endif +#define _Guarded_by_(e) +#ifdef _Write_guarded_by_ +#undef _Write_guarded_by_ +#endif +#define _Write_guarded_by_(e) +#ifdef _Interlocked_ +#undef _Interlocked_ +#endif +#define _Interlocked_ +#ifdef _Post_same_lock_ +#undef _Post_same_lock_ +#endif +#define _Post_same_lock_(e1,e2) +#ifdef _Benign_race_begin_ +#undef _Benign_race_begin_ +#endif +#define _Benign_race_begin_ +#ifdef _Benign_race_end_ +#undef _Benign_race_end_ +#endif +#define _Benign_race_end_ +#ifdef _No_competing_thread_ +#undef _No_competing_thread_ +#endif +#define _No_competing_thread_ +#ifdef _No_competing_thread_begin_ +#undef _No_competing_thread_begin_ +#endif +#define _No_competing_thread_begin_ +#ifdef _No_competing_thread_end_ +#undef _No_competing_thread_end_ +#endif +#define _No_competing_thread_end_ +#ifdef _Acquires_shared_lock_ +#undef _Acquires_shared_lock_ +#endif +#define _Acquires_shared_lock_(e) +#ifdef _Releases_shared_lock_ +#undef _Releases_shared_lock_ +#endif +#define _Releases_shared_lock_(e) +#ifdef _Requires_shared_lock_held_ +#undef 
_Requires_shared_lock_held_ +#endif +#define _Requires_shared_lock_held_(e) +#ifdef _Acquires_exclusive_lock_ +#undef _Acquires_exclusive_lock_ +#endif +#define _Acquires_exclusive_lock_(e) +#ifdef _Releases_exclusive_lock_ +#undef _Releases_exclusive_lock_ +#endif +#define _Releases_exclusive_lock_(e) +#ifdef _Requires_exclusive_lock_held_ +#undef _Requires_exclusive_lock_held_ +#endif +#define _Requires_exclusive_lock_held_(e) +#ifdef _Has_lock_kind_ +#undef _Has_lock_kind_ +#endif +#define _Has_lock_kind_(n) +#ifdef _Create_lock_level_ +#undef _Create_lock_level_ +#endif +#define _Create_lock_level_(n) +#ifdef _Has_lock_level_ +#undef _Has_lock_level_ +#endif +#define _Has_lock_level_(n) +#ifdef _Lock_level_order_ +#undef _Lock_level_order_ +#endif +#define _Lock_level_order_(n1,n2) +#ifdef _Analysis_assume_lock_acquired_ +#undef _Analysis_assume_lock_acquired_ +#endif +#define _Analysis_assume_lock_acquired_(e) +#ifdef _Analysis_assume_lock_released_ +#undef _Analysis_assume_lock_released_ +#endif +#define _Analysis_assume_lock_released_(e) +#ifdef _Analysis_assume_lock_held_ +#undef _Analysis_assume_lock_held_ +#endif +#define _Analysis_assume_lock_held_(e) +#ifdef _Analysis_assume_lock_not_held_ +#undef _Analysis_assume_lock_not_held_ +#endif +#define _Analysis_assume_lock_not_held_(e) +#ifdef _Analysis_assume_same_lock_ +#undef _Analysis_assume_same_lock_ +#endif +#define _Analysis_assume_same_lock_(e) +#ifdef _In_ +#undef _In_ +#endif +#define _In_ +#ifdef _Out_ +#undef _Out_ +#endif +#define _Out_ +#ifdef _Inout_ +#undef _Inout_ +#endif +#define _Inout_ +#ifdef _In_z_ +#undef _In_z_ +#endif +#define _In_z_ +#ifdef _Inout_z_ +#undef _Inout_z_ +#endif +#define _Inout_z_ +#ifdef _In_reads_ +#undef _In_reads_ +#endif +#define _In_reads_(s) +#ifdef _In_reads_bytes_ +#undef _In_reads_bytes_ +#endif +#define _In_reads_bytes_(s) +#ifdef _In_reads_z_ +#undef _In_reads_z_ +#endif +#define _In_reads_z_(s) +#ifdef _In_reads_or_z_ +#undef _In_reads_or_z_ +#endif 
+#define _In_reads_or_z_(s) +#ifdef _Out_writes_ +#undef _Out_writes_ +#endif +#define _Out_writes_(s) +#ifdef _Out_writes_bytes_ +#undef _Out_writes_bytes_ +#endif +#define _Out_writes_bytes_(s) +#ifdef _Out_writes_z_ +#undef _Out_writes_z_ +#endif +#define _Out_writes_z_(s) +#ifdef _Inout_updates_ +#undef _Inout_updates_ +#endif +#define _Inout_updates_(s) +#ifdef _Inout_updates_bytes_ +#undef _Inout_updates_bytes_ +#endif +#define _Inout_updates_bytes_(s) +#ifdef _Inout_updates_z_ +#undef _Inout_updates_z_ +#endif +#define _Inout_updates_z_(s) +#ifdef _Out_writes_to_ +#undef _Out_writes_to_ +#endif +#define _Out_writes_to_(s,c) +#ifdef _Out_writes_bytes_to_ +#undef _Out_writes_bytes_to_ +#endif +#define _Out_writes_bytes_to_(s,c) +#ifdef _Out_writes_all_ +#undef _Out_writes_all_ +#endif +#define _Out_writes_all_(s) +#ifdef _Out_writes_bytes_all_ +#undef _Out_writes_bytes_all_ +#endif +#define _Out_writes_bytes_all_(s) +#ifdef _Inout_updates_to_ +#undef _Inout_updates_to_ +#endif +#define _Inout_updates_to_(s,c) +#ifdef _Inout_updates_bytes_to_ +#undef _Inout_updates_bytes_to_ +#endif +#define _Inout_updates_bytes_to_(s,c) +#ifdef _Inout_updates_all_ +#undef _Inout_updates_all_ +#endif +#define _Inout_updates_all_(s) +#ifdef _Inout_updates_bytes_all_ +#undef _Inout_updates_bytes_all_ +#endif +#define _Inout_updates_bytes_all_(s) +#ifdef _In_reads_to_ptr_ +#undef _In_reads_to_ptr_ +#endif +#define _In_reads_to_ptr_(p) +#ifdef _In_reads_to_ptr_z_ +#undef _In_reads_to_ptr_z_ +#endif +#define _In_reads_to_ptr_z_(p) +#ifdef _Out_writes_to_ptr_ +#undef _Out_writes_to_ptr_ +#endif +#define _Out_writes_to_ptr_(p) +#ifdef _Out_writes_to_ptr_z_ +#undef _Out_writes_to_ptr_z_ +#endif +#define _Out_writes_to_ptr_z_(p) +#ifdef _In_opt_ +#undef _In_opt_ +#endif +#define _In_opt_ +#ifdef _Out_opt_ +#undef _Out_opt_ +#endif +#define _Out_opt_ +#ifdef _Inout_opt_ +#undef _Inout_opt_ +#endif +#define _Inout_opt_ +#ifdef _In_opt_z_ +#undef _In_opt_z_ +#endif +#define _In_opt_z_ 
+#ifdef _Inout_opt_z_ +#undef _Inout_opt_z_ +#endif +#define _Inout_opt_z_ +#ifdef _In_reads_opt_ +#undef _In_reads_opt_ +#endif +#define _In_reads_opt_(s) +#ifdef _In_reads_opt_z_ +#undef _In_reads_opt_z_ +#endif +#define _In_reads_opt_z_(s) +#ifdef _In_reads_bytes_opt_ +#undef _In_reads_bytes_opt_ +#endif +#define _In_reads_bytes_opt_(s) +#ifdef _Out_writes_opt_ +#undef _Out_writes_opt_ +#endif +#define _Out_writes_opt_(s) +#ifdef _Out_writes_bytes_opt_ +#undef _Out_writes_bytes_opt_ +#endif +#define _Out_writes_bytes_opt_(s) +#ifdef _Out_writes_opt_z_ +#undef _Out_writes_opt_z_ +#endif +#define _Out_writes_opt_z_(s) +#ifdef _Inout_updates_opt_ +#undef _Inout_updates_opt_ +#endif +#define _Inout_updates_opt_(s) +#ifdef _Inout_updates_bytes_opt_ +#undef _Inout_updates_bytes_opt_ +#endif +#define _Inout_updates_bytes_opt_(s) +#ifdef _Inout_updates_opt_z_ +#undef _Inout_updates_opt_z_ +#endif +#define _Inout_updates_opt_z_(s) +#ifdef _Out_writes_to_opt_ +#undef _Out_writes_to_opt_ +#endif +#define _Out_writes_to_opt_(s,c) +#ifdef _Out_writes_bytes_to_opt_ +#undef _Out_writes_bytes_to_opt_ +#endif +#define _Out_writes_bytes_to_opt_(s,c) +#ifdef _Out_writes_all_opt_ +#undef _Out_writes_all_opt_ +#endif +#define _Out_writes_all_opt_(s) +#ifdef _Out_writes_bytes_all_opt_ +#undef _Out_writes_bytes_all_opt_ +#endif +#define _Out_writes_bytes_all_opt_(s) +#ifdef _Inout_updates_to_opt_ +#undef _Inout_updates_to_opt_ +#endif +#define _Inout_updates_to_opt_(s,c) +#ifdef _Inout_updates_bytes_to_opt_ +#undef _Inout_updates_bytes_to_opt_ +#endif +#define _Inout_updates_bytes_to_opt_(s,c) +#ifdef _Inout_updates_all_opt_ +#undef _Inout_updates_all_opt_ +#endif +#define _Inout_updates_all_opt_(s) +#ifdef _Inout_updates_bytes_all_opt_ +#undef _Inout_updates_bytes_all_opt_ +#endif +#define _Inout_updates_bytes_all_opt_(s) +#ifdef _In_reads_to_ptr_opt_ +#undef _In_reads_to_ptr_opt_ +#endif +#define _In_reads_to_ptr_opt_(p) +#ifdef _In_reads_to_ptr_opt_z_ +#undef 
_In_reads_to_ptr_opt_z_ +#endif +#define _In_reads_to_ptr_opt_z_(p) +#ifdef _Out_writes_to_ptr_opt_ +#undef _Out_writes_to_ptr_opt_ +#endif +#define _Out_writes_to_ptr_opt_(p) +#ifdef _Out_writes_to_ptr_opt_z_ +#undef _Out_writes_to_ptr_opt_z_ +#endif +#define _Out_writes_to_ptr_opt_z_(p) +#ifdef _Outptr_ +#undef _Outptr_ +#endif +#define _Outptr_ +#ifdef _Outptr_opt_ +#undef _Outptr_opt_ +#endif +#define _Outptr_opt_ +#ifdef _Outptr_result_maybenull_ +#undef _Outptr_result_maybenull_ +#endif +#define _Outptr_result_maybenull_ +#ifdef _Outptr_opt_result_maybenull_ +#undef _Outptr_opt_result_maybenull_ +#endif +#define _Outptr_opt_result_maybenull_ +#ifdef _Outptr_result_z_ +#undef _Outptr_result_z_ +#endif +#define _Outptr_result_z_ +#ifdef _Outptr_opt_result_z_ +#undef _Outptr_opt_result_z_ +#endif +#define _Outptr_opt_result_z_ +#ifdef _Outptr_result_maybenull_z_ +#undef _Outptr_result_maybenull_z_ +#endif +#define _Outptr_result_maybenull_z_ +#ifdef _Outptr_opt_result_maybenull_z_ +#undef _Outptr_opt_result_maybenull_z_ +#endif +#define _Outptr_opt_result_maybenull_z_ +#ifdef _COM_Outptr_ +#undef _COM_Outptr_ +#endif +#define _COM_Outptr_ +#ifdef _COM_Outptr_opt_ +#undef _COM_Outptr_opt_ +#endif +#define _COM_Outptr_opt_ +#ifdef _COM_Outptr_result_maybenull_ +#undef _COM_Outptr_result_maybenull_ +#endif +#define _COM_Outptr_result_maybenull_ +#ifdef _COM_Outptr_opt_result_maybenull_ +#undef _COM_Outptr_opt_result_maybenull_ +#endif +#define _COM_Outptr_opt_result_maybenull_ +#ifdef _Outptr_result_buffer_ +#undef _Outptr_result_buffer_ +#endif +#define _Outptr_result_buffer_(s) +#ifdef _Outptr_result_bytebuffer_ +#undef _Outptr_result_bytebuffer_ +#endif +#define _Outptr_result_bytebuffer_(s) +#ifdef _Outptr_opt_result_buffer_ +#undef _Outptr_opt_result_buffer_ +#endif +#define _Outptr_opt_result_buffer_(s) +#ifdef _Outptr_opt_result_bytebuffer_ +#undef _Outptr_opt_result_bytebuffer_ +#endif +#define _Outptr_opt_result_bytebuffer_(s) +#ifdef 
_Outptr_result_buffer_to_ +#undef _Outptr_result_buffer_to_ +#endif +#define _Outptr_result_buffer_to_(s,c) +#ifdef _Outptr_result_bytebuffer_to_ +#undef _Outptr_result_bytebuffer_to_ +#endif +#define _Outptr_result_bytebuffer_to_(s,c) +#ifdef _Outptr_opt_result_buffer_to_ +#undef _Outptr_opt_result_buffer_to_ +#endif +#define _Outptr_opt_result_buffer_to_(s,c) +#ifdef _Outptr_opt_result_bytebuffer_to_ +#undef _Outptr_opt_result_bytebuffer_to_ +#endif +#define _Outptr_opt_result_bytebuffer_to_(s,c) +#ifdef _Ret_ +#undef _Ret_ +#endif +#define _Ret_ +#ifdef _Ret_valid_ +#undef _Ret_valid_ +#endif +#define _Ret_valid_ +#ifdef _Ret_z_ +#undef _Ret_z_ +#endif +#define _Ret_z_ +#ifdef _Ret_writes_ +#undef _Ret_writes_ +#endif +#define _Ret_writes_(s) +#ifdef _Ret_writes_bytes_ +#undef _Ret_writes_bytes_ +#endif +#define _Ret_writes_bytes_(s) +#ifdef _Ret_writes_z_ +#undef _Ret_writes_z_ +#endif +#define _Ret_writes_z_(s) +#ifdef _Ret_writes_to_ +#undef _Ret_writes_to_ +#endif +#define _Ret_writes_to_(s,c) +#ifdef _Ret_writes_bytes_to_ +#undef _Ret_writes_bytes_to_ +#endif +#define _Ret_writes_bytes_to_(s,c) +#ifdef _Ret_writes_to_ptr_ +#undef _Ret_writes_to_ptr_ +#endif +#define _Ret_writes_to_ptr_(p) +#ifdef _Ret_writes_to_ptr_z_ +#undef _Ret_writes_to_ptr_z_ +#endif +#define _Ret_writes_to_ptr_z_(p) +#ifdef _Ret_writes_maybenull_ +#undef _Ret_writes_maybenull_ +#endif +#define _Ret_writes_maybenull_(s) +#ifdef _Ret_writes_bytes_maybenull_ +#undef _Ret_writes_bytes_maybenull_ +#endif +#define _Ret_writes_bytes_maybenull_(s) +#ifdef _Ret_writes_to_maybenull_ +#undef _Ret_writes_to_maybenull_ +#endif +#define _Ret_writes_to_maybenull_(s,c) +#ifdef _Ret_writes_bytes_to_maybenull_ +#undef _Ret_writes_bytes_to_maybenull_ +#endif +#define _Ret_writes_bytes_to_maybenull_(s,c) +#ifdef _Ret_writes_maybenull_z_ +#undef _Ret_writes_maybenull_z_ +#endif +#define _Ret_writes_maybenull_z_(s) +#ifdef _Ret_null_ +#undef _Ret_null_ +#endif +#define _Ret_null_ +#ifdef _Ret_notnull_ 
+#undef _Ret_notnull_ +#endif +#define _Ret_notnull_ +#ifdef _Ret_maybenull_ +#undef _Ret_maybenull_ +#endif +#define _Ret_maybenull_ +#ifdef _Ret_maybenull_z_ +#undef _Ret_maybenull_z_ +#endif +#define _Ret_maybenull_z_ +#ifdef _Field_size_ +#undef _Field_size_ +#endif +#define _Field_size_(s) +#ifdef _Field_size_opt_ +#undef _Field_size_opt_ +#endif +#define _Field_size_opt_(s) +#ifdef _Field_size_bytes_ +#undef _Field_size_bytes_ +#endif +#define _Field_size_bytes_(s) +#ifdef _Field_size_bytes_opt_ +#undef _Field_size_bytes_opt_ +#endif +#define _Field_size_bytes_opt_(s) +#ifdef _Field_size_part_ +#undef _Field_size_part_ +#endif +#define _Field_size_part_(s,c) +#ifdef _Field_size_part_opt_ +#undef _Field_size_part_opt_ +#endif +#define _Field_size_part_opt_(s,c) +#ifdef _Field_size_bytes_part_ +#undef _Field_size_bytes_part_ +#endif +#define _Field_size_bytes_part_(s,c) +#ifdef _Field_size_bytes_part_opt_ +#undef _Field_size_bytes_part_opt_ +#endif +#define _Field_size_bytes_part_opt_(s,c) +#ifdef _Field_size_full_ +#undef _Field_size_full_ +#endif +#define _Field_size_full_(s) +#ifdef _Field_size_full_opt_ +#undef _Field_size_full_opt_ +#endif +#define _Field_size_full_opt_(s) +#ifdef _Field_size_bytes_full_ +#undef _Field_size_bytes_full_ +#endif +#define _Field_size_bytes_full_(s) +#ifdef _Field_size_bytes_full_opt_ +#undef _Field_size_bytes_full_opt_ +#endif +#define _Field_size_bytes_full_opt_(s) +#ifdef _Printf_format_string_ +#undef _Printf_format_string_ +#endif +#define _Printf_format_string_ +#ifdef _Scanf_format_string_ +#undef _Scanf_format_string_ +#endif +#define _Scanf_format_string_ +#ifdef _Scanf_s_format_string_ +#undef _Scanf_s_format_string_ +#endif +#define _Scanf_s_format_string_ +#ifdef _Printf_format_string_params_ +#undef _Printf_format_string_params_ +#endif +#define _Printf_format_string_params_(x) +#ifdef _Scanf_format_string_params_ +#undef _Scanf_format_string_params_ +#endif +#define _Scanf_format_string_params_(x) +#ifdef 
_Scanf_s_format_string_params_ +#undef _Scanf_s_format_string_params_ +#endif +#define _Scanf_s_format_string_params_(x) +#ifdef _In_range_ +#undef _In_range_ +#endif +#define _In_range_(l,h) +#ifdef _Out_range_ +#undef _Out_range_ +#endif +#define _Out_range_(l,h) +#ifdef _Ret_range_ +#undef _Ret_range_ +#endif +#define _Ret_range_(l,h) +#ifdef _Deref_in_range_ +#undef _Deref_in_range_ +#endif +#define _Deref_in_range_(l,h) +#ifdef _Deref_out_range_ +#undef _Deref_out_range_ +#endif +#define _Deref_out_range_(l,h) +#ifdef _Deref_inout_range_ +#undef _Deref_inout_range_ +#endif +#define _Deref_inout_range_(l,h) +#ifdef _Field_range_ +#undef _Field_range_ +#endif +#define _Field_range_(l,h) +#ifdef _Pre_equal_to_ +#undef _Pre_equal_to_ +#endif +#define _Pre_equal_to_(e) +#ifdef _Post_equal_to_ +#undef _Post_equal_to_ +#endif +#define _Post_equal_to_(e) +#ifdef _Struct_size_bytes_ +#undef _Struct_size_bytes_ +#endif +#define _Struct_size_bytes_(s) +#ifdef _Analysis_assume_ +#undef _Analysis_assume_ +#endif +#define _Analysis_assume_ +#ifdef _Analysis_mode_ +#undef _Analysis_mode_ +#endif +#define _Analysis_mode_(m) +#ifdef _Analysis_noreturn_ +#undef _Analysis_noreturn_ +#endif +#define _Analysis_noreturn_ +#ifdef _Raises_SEH_exception_ +#undef _Raises_SEH_exception_ +#endif +#define _Raises_SEH_exception_ +#ifdef _Maybe_raises_SEH_exception_ +#undef _Maybe_raises_SEH_exception_ +#endif +#define _Maybe_raises_SEH_exception_ +#ifdef _Function_class_ +#undef _Function_class_ +#endif +#define _Function_class_(n) +#ifdef _Literal_ +#undef _Literal_ +#endif +#define _Literal_ +#ifdef _Notliteral_ +#undef _Notliteral_ +#endif +#define _Notliteral_ +#ifdef _Enum_is_bitflag_ +#undef _Enum_is_bitflag_ +#endif +#define _Enum_is_bitflag_ +#ifdef _Strict_type_match_ +#undef _Strict_type_match_ +#endif +#define _Strict_type_match_ +#ifdef _Points_to_data_ +#undef _Points_to_data_ +#endif +#define _Points_to_data_ +#ifdef _Interlocked_operand_ +#undef _Interlocked_operand_ +#endif 
+#define _Interlocked_operand_ +#ifdef _IRQL_raises_ +#undef _IRQL_raises_ +#endif +#define _IRQL_raises_(i) +#ifdef _IRQL_requires_ +#undef _IRQL_requires_ +#endif +#define _IRQL_requires_(i) +#ifdef _IRQL_requires_max_ +#undef _IRQL_requires_max_ +#endif +#define _IRQL_requires_max_(i) +#ifdef _IRQL_requires_min_ +#undef _IRQL_requires_min_ +#endif +#define _IRQL_requires_min_(i) +#ifdef _IRQL_saves_ +#undef _IRQL_saves_ +#endif +#define _IRQL_saves_ +#ifdef _IRQL_saves_global_ +#undef _IRQL_saves_global_ +#endif +#define _IRQL_saves_global_(k,s) +#ifdef _IRQL_restores_ +#undef _IRQL_restores_ +#endif +#define _IRQL_restores_ +#ifdef _IRQL_restores_global_ +#undef _IRQL_restores_global_ +#endif +#define _IRQL_restores_global_(k,s) +#ifdef _IRQL_always_function_min_ +#undef _IRQL_always_function_min_ +#endif +#define _IRQL_always_function_min_(i) +#ifdef _IRQL_always_function_max_ +#undef _IRQL_always_function_max_ +#endif +#define _IRQL_always_function_max_(i) +#ifdef _IRQL_requires_same_ +#undef _IRQL_requires_same_ +#endif +#define _IRQL_requires_same_ +#ifdef _IRQL_uses_cancel_ +#undef _IRQL_uses_cancel_ +#endif +#define _IRQL_uses_cancel_ +#ifdef _IRQL_is_cancel_ +#undef _IRQL_is_cancel_ +#endif +#define _IRQL_is_cancel_ +#ifdef _Kernel_float_saved_ +#undef _Kernel_float_saved_ +#endif +#define _Kernel_float_saved_ +#ifdef _Kernel_float_restored_ +#undef _Kernel_float_restored_ +#endif +#define _Kernel_float_restored_ +#ifdef _Kernel_float_used_ +#undef _Kernel_float_used_ +#endif +#define _Kernel_float_used_ +#ifdef _Kernel_acquires_resource_ +#undef _Kernel_acquires_resource_ +#endif +#define _Kernel_acquires_resource_(k) +#ifdef _Kernel_releases_resource_ +#undef _Kernel_releases_resource_ +#endif +#define _Kernel_releases_resource_(k) +#ifdef _Kernel_requires_resource_held_ +#undef _Kernel_requires_resource_held_ +#endif +#define _Kernel_requires_resource_held_(k) +#ifdef _Kernel_requires_resource_not_held_ +#undef _Kernel_requires_resource_not_held_ 
+#endif +#define _Kernel_requires_resource_not_held_(k) +#ifdef _Kernel_clear_do_init_ +#undef _Kernel_clear_do_init_ +#endif +#define _Kernel_clear_do_init_(yn) +#ifdef _Kernel_IoGetDmaAdapter_ +#undef _Kernel_IoGetDmaAdapter_ +#endif +#define _Kernel_IoGetDmaAdapter_ +#ifdef _Outref_ +#undef _Outref_ +#endif +#define _Outref_ +#ifdef _Outref_result_maybenull_ +#undef _Outref_result_maybenull_ +#endif +#define _Outref_result_maybenull_ +#ifdef _Outref_result_buffer_ +#undef _Outref_result_buffer_ +#endif +#define _Outref_result_buffer_(s) +#ifdef _Outref_result_bytebuffer_ +#undef _Outref_result_bytebuffer_ +#endif +#define _Outref_result_bytebuffer_(s) +#ifdef _Outref_result_buffer_to_ +#undef _Outref_result_buffer_to_ +#endif +#define _Outref_result_buffer_to_(s,c) +#ifdef _Outref_result_bytebuffer_to_ +#undef _Outref_result_bytebuffer_to_ +#endif +#define _Outref_result_bytebuffer_to_(s,c) +#ifdef _Outref_result_buffer_all_ +#undef _Outref_result_buffer_all_ +#endif +#define _Outref_result_buffer_all_(s) +#ifdef _Outref_result_bytebuffer_all_ +#undef _Outref_result_bytebuffer_all_ +#endif +#define _Outref_result_bytebuffer_all_(s) +#ifdef _Outref_result_buffer_maybenull_ +#undef _Outref_result_buffer_maybenull_ +#endif +#define _Outref_result_buffer_maybenull_(s) +#ifdef _Outref_result_bytebuffer_maybenull_ +#undef _Outref_result_bytebuffer_maybenull_ +#endif +#define _Outref_result_bytebuffer_maybenull_(s) +#ifdef _Outref_result_buffer_to_maybenull_ +#undef _Outref_result_buffer_to_maybenull_ +#endif +#define _Outref_result_buffer_to_maybenull_(s,c) +#ifdef _Outref_result_bytebuffer_to_maybenull_ +#undef _Outref_result_bytebuffer_to_maybenull_ +#endif +#define _Outref_result_bytebuffer_to_maybenull_(s,c) +#ifdef _Outref_result_buffer_all_maybenull_ +#undef _Outref_result_buffer_all_maybenull_ +#endif +#define _Outref_result_buffer_all_maybenull_(s) +#ifdef _Outref_result_bytebuffer_all_maybenull_ +#undef _Outref_result_bytebuffer_all_maybenull_ +#endif 
+#define _Outref_result_bytebuffer_all_maybenull_(s) +#ifdef _In_defensive_ +#undef _In_defensive_ +#endif +#define _In_defensive_(a) +#ifdef _Out_defensive_ +#undef _Out_defensive_ +#endif +#define _Out_defensive_(a) +#ifdef _Inout_defensive_ +#undef _Inout_defensive_ +#endif +#define _Inout_defensive_(a) +#ifdef _Outptr_result_nullonfailure_ +#undef _Outptr_result_nullonfailure_ +#endif +#define _Outptr_result_nullonfailure_ +#ifdef _Outptr_opt_result_nullonfailure_ +#undef _Outptr_opt_result_nullonfailure_ +#endif +#define _Outptr_opt_result_nullonfailure_ +#ifdef _Outref_result_nullonfailure_ +#undef _Outref_result_nullonfailure_ +#endif +#define _Outref_result_nullonfailure_ +#ifdef _Result_nullonfailure_ +#undef _Result_nullonfailure_ +#endif +#define _Result_nullonfailure_ +#ifdef _Result_zeroonfailure_ +#undef _Result_zeroonfailure_ +#endif +#define _Result_zeroonfailure_ +#ifdef _Acquires_nonreentrant_lock_ +#undef _Acquires_nonreentrant_lock_ +#endif +#define _Acquires_nonreentrant_lock_(e) +#ifdef _Releases_nonreentrant_lock_ +#undef _Releases_nonreentrant_lock_ +#endif +#define _Releases_nonreentrant_lock_(e) +#ifdef _Function_ignore_lock_checking_ +#undef _Function_ignore_lock_checking_ +#endif +#define _Function_ignore_lock_checking_(e) +#ifdef _Analysis_suppress_lock_checking_ +#undef _Analysis_suppress_lock_checking_ +#endif +#define _Analysis_suppress_lock_checking_(e) +#undef _Reserved_ +#define _Reserved_ _Pre_equal_to_(0) _Pre_ _Null_ +#undef _Pre_z_ +#define _Pre_z_ _Pre_ _Null_terminated_ +#undef _Post_z_ +#define _Post_z_ _Post_ _Null_terminated_ +#undef _Prepost_z_ +#define _Prepost_z_ _Pre_z_ _Post_z_ +#undef _Pre_null_ +#define _Pre_null_ _Pre_ _Null_ +#undef _Pre_maybenull_ +#define _Pre_maybenull_ _Pre_ _Maybenull_ +#undef _Pre_notnull_ +#define _Pre_notnull_ _Pre_ _Notnull_ +#undef _Pre_valid_ +#define _Pre_valid_ _Pre_notnull_ _Pre_ _Valid_ +#undef _Pre_opt_valid_ +#define _Pre_opt_valid_ _Pre_maybenull_ _Pre_ _Valid_ +#undef 
_Post_valid_ +#define _Post_valid_ _Post_ _Valid_ +#undef _Post_invalid_ +#define _Post_invalid_ _Post_ _Deref_ _Notvalid_ +#undef _Post_ptr_invalid_ +#define _Post_ptr_invalid_ _Post_ _Notvalid_ +#undef _Pre_readable_size_ +#define _Pre_readable_size_(s) _Pre_ _Readable_elements_(s) _Pre_ _Valid_ +#undef _Pre_writable_size_ +#define _Pre_writable_size_(s) _Pre_ _Writable_elements_(s) +#undef _Pre_readable_byte_size_ +#define _Pre_readable_byte_size_(s) _Pre_ _Readable_bytes_(s) _Pre_ _Valid_ +#undef _Pre_writable_byte_size_ +#define _Pre_writable_byte_size_(s) _Pre_ _Writable_bytes_(s) +#undef _Post_readable_size_ +#define _Post_readable_size_(s) _Post_ _Readable_elements_(s) _Post_ _Valid_ +#undef _Post_writable_size_ +#define _Post_writable_size_(s) _Post_ _Writable_elements_(s) +#undef _Post_readable_byte_size_ +#define _Post_readable_byte_size_(s) _Post_ _Readable_bytes_(s) _Post_ _Valid_ +#undef _Post_writable_byte_size_ +#define _Post_writable_byte_size_(s) _Post_ _Writable_bytes_(s) + +#endif /* _NO_SAL_2_H_ */ diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/sal.h b/Minecraft.Client/PS3/PS3Extras/DirectX/sal.h new file mode 100644 index 00000000..3576d7ed --- /dev/null +++ b/Minecraft.Client/PS3/PS3Extras/DirectX/sal.h @@ -0,0 +1,1998 @@ +/*** +*sal.h - markers for documenting the semantics of APIs +* +* Copyright (c) Microsoft Corporation. All rights reserved. +* +*Purpose: +* sal.h provides a set of annotations to describe how a function uses its +* parameters - the assumptions it makes about them, and the guarantees it makes +* upon finishing. +* +* [Public] +* +****/ + +#pragma once +/*========================================================================== + + The macros are defined in 3 layers: + + _In_\_Out_ Layer: + ---------------- + This layer provides the highest abstraction and its macros should be used + in most cases. Its macros start with _In_, _Out_ or _Inout_. For the + typical case they provide the most concise annotations. 
+ + _Pre_\_Post_ Layer: + ------------------ + The macros of this layer only should be used when there is no suitable macro + in the _In_\_Out_ layer. Its macros start with _Pre_, _Post_, _Ret_, + _Deref_pre_ _Deref_post_ and _Deref_ret_. This layer provides the most + flexibility for annotations. + + Implementation Abstraction Layer: + -------------------------------- + Macros from this layer should never be used directly. The layer only exists + to hide the implementation of the annotation macros. + + + Annotation Syntax: + |--------------|----------|----------------|-----------------------------| + | Usage | Nullness | ZeroTerminated | Extent | + |--------------|----------|----------------|-----------------------------| + | _In_ | <> | <> | <> | + | _Out_ | opt_ | z_ | [byte]cap_[c_|x_]( size ) | + | _Inout_ | | | [byte]count_[c_|x_]( size ) | + | _Deref_out_ | | | ptrdiff_cap_( ptr ) | + |--------------| | | ptrdiff_count_( ptr ) | + | _Ret_ | | | | + | _Deref_ret_ | | | | + |--------------| | | | + | _Pre_ | | | | + | _Post_ | | | | + | _Deref_pre_ | | | | + | _Deref_post_ | | | | + |--------------|----------|----------------|-----------------------------| + + Usage: + ----- + _In_, _Out_, _Inout_, _Pre_, _Post_, _Deref_pre_, _Deref_post_ are for + formal parameters. + _Ret_, _Deref_ret_ must be used for return values. + + Nullness: + -------- + If the pointer can be NULL the annotation contains _opt. If the macro + does not contain '_opt' the pointer may not be NULL. 
+ + String Type: + ----------- + _z: NullTerminated string + for _In_ parameters the buffer must have the specified stringtype before the call + for _Out_ parameters the buffer must have the specified stringtype after the call + for _Inout_ parameters both conditions apply + + Extent Syntax: + |------|---------------|---------------| + | Unit | Writ\Readable | Argument Type | + |------|---------------|---------------| + | <> | cap_ | <> | + | byte | count_ | c_ | + | | | x_ | + |------|---------------|---------------| + + 'cap' (capacity) describes the writable size of the buffer and is typically used + with _Out_. The default unit is elements. Use 'bytecap' if the size is given in bytes + 'count' describes the readable size of the buffer and is typically used with _In_. + The default unit is elements. Use 'bytecount' if the size is given in bytes. + + Argument syntax for cap_, bytecap_, count_, bytecount_: + (<parameter>|return)[+n] e.g. cch, return, cb+2 + + If the buffer size is a constant expression use the c_ postfix. + E.g. cap_c_(20), count_c_(MAX_PATH), bytecount_c_(16) + + If the buffer size is given by a limiting pointer use the ptrdiff_ versions + of the macros. + + If the buffer size is neither a parameter nor a constant expression use the x_ + postfix. e.g. bytecount_x_(num*size) x_ annotations accept any arbitrary string. + No analysis can be done for x_ annotations but they at least tell the tool that + the buffer has some sort of extent description. x_ annotations might be supported + by future compiler versions. 
+ +============================================================================*/ + +#define __ATTR_SAL + +#ifdef _PREFAST_ +// choose attribute or __declspec implementation +#ifndef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif + +#if _USE_DECLSPECS_FOR_SAL +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#elif !defined(_USE_ATTRIBUTES_FOR_SAL) +#if _MSC_VER >= 1500 +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif // if _MSC_VER >= 1400 +#endif // if _USE_DECLSPECS_FOR_SAL + + +#if !_USE_DECLSPECS_FOR_SAL +#if !_USE_ATTRIBUTES_FOR_SAL +#if _MSC_VER >= 1500 +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 1 +#else +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 1 +#endif // _MSC_VER >= 1400 +#endif // !_USE_ATTRIBUTES_FOR_SAL +#endif // !_USE_DECLSPECS_FOR_SAL + +#endif // #ifdef _PREFAST_ + +// Disable expansion of SAL macros in non-Prefast mode to +// improve compiler throughput. +#ifndef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif +#ifndef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif + +// safeguard for MIDL and RC builds +#if _USE_DECLSPECS_FOR_SAL && ( defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) || !defined(_PREFAST_) ) +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL 0 +#endif +#if _USE_ATTRIBUTES_FOR_SAL && ( !defined(_MSC_EXTENSIONS) || defined( MIDL_PASS ) || defined(__midl) || defined(RC_INVOKED) ) +#undef _USE_ATTRIBUTES_FOR_SAL +#define _USE_ATTRIBUTES_FOR_SAL 0 +#endif + +#if defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) +#include "codeanalysis\sourceannotations.h" +#endif + +//============================================================================ +// _In_\_Out_ Layer: +//============================================================================ + +// 'in' parameters -------------------------- + +// input pointer 
parameter +// e.g. void SetPoint( _In_ const POINT* pPT ); +#define _In_ _Pre1_impl_(_$notnull) _Deref_pre2_impl_(_$valid, _$readaccess) +#define _In_opt_ _Pre_opt_valid_ _Deref_pre_readonly_ + +// nullterminated 'in' parameters. +// e.g. void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _In_z_ _Pre_z_ _Deref_pre_readonly_ +#define _In_opt_z_ _Pre_opt_z_ _Deref_pre_readonly_ + +// 'input' buffers with given size + +// e.g. void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) +// valid buffer extent described by another parameter +#define _In_count_(size) _Pre_count_(size) _Deref_pre_readonly_ +#define _In_opt_count_(size) _Pre_opt_count_(size) _Deref_pre_readonly_ +#define _In_bytecount_(size) _Pre_bytecount_(size) _Deref_pre_readonly_ +#define _In_opt_bytecount_(size) _Pre_opt_bytecount_(size) _Deref_pre_readonly_ + +// valid buffer extent described by a constant extression +#define _In_count_c_(size) _Pre_count_c_(size) _Deref_pre_readonly_ +#define _In_opt_count_c_(size) _Pre_opt_count_c_(size) _Deref_pre_readonly_ +#define _In_bytecount_c_(size) _Pre_bytecount_c_(size) _Deref_pre_readonly_ +#define _In_opt_bytecount_c_(size) _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_ + +// nullterminated 'input' buffers with given size + +// e.g. 
void SetCharRange( _In_count_(cch) const char* rgch, size_t cch ) +// nullterminated valid buffer extent described by another parameter +#define _In_z_count_(size) _Pre_z_ _Pre_count_(size) _Deref_pre_readonly_ +#define _In_opt_z_count_(size) _Pre_opt_z_ _Pre_opt_count_(size) _Deref_pre_readonly_ +#define _In_z_bytecount_(size) _Pre_z_ _Pre_bytecount_(size) _Deref_pre_readonly_ +#define _In_opt_z_bytecount_(size) _Pre_opt_z_ _Pre_opt_bytecount_(size) _Deref_pre_readonly_ + +// nullterminated valid buffer extent described by a constant extression +#define _In_z_count_c_(size) _Pre_z_ _Pre_count_c_(size) _Deref_pre_readonly_ +#define _In_opt_z_count_c_(size) _Pre_opt_z_ _Pre_opt_count_c_(size) _Deref_pre_readonly_ +#define _In_z_bytecount_c_(size) _Pre_z_ _Pre_bytecount_c_(size) _Deref_pre_readonly_ +#define _In_opt_z_bytecount_c_(size) _Pre_opt_z_ _Pre_opt_bytecount_c_(size) _Deref_pre_readonly_ + +// buffer capacity is described by another pointer +// e.g. void Foo( _In_ptrdiff_count_(pchMax) const char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } +#define _In_ptrdiff_count_(size) _Pre_ptrdiff_count_(size) _Deref_pre_readonly_ +#define _In_opt_ptrdiff_count_(size) _Pre_opt_ptrdiff_count_(size) _Deref_pre_readonly_ + +// 'x' version for complex expressions that are not supported by the current compiler version +// e.g. void Set3ColMatrix( _In_count_x_(3*cRows) const Elem* matrix, int cRows ); +#define _In_count_x_(size) _Pre_count_x_(size) _Deref_pre_readonly_ +#define _In_opt_count_x_(size) _Pre_opt_count_x_(size) _Deref_pre_readonly_ +#define _In_bytecount_x_(size) _Pre_bytecount_x_(size) _Deref_pre_readonly_ +#define _In_opt_bytecount_x_(size) _Pre_opt_bytecount_x_(size) _Deref_pre_readonly_ + +// 'out' parameters -------------------------- + +// output pointer parameter +// e.g. void GetPoint( _Out_ POINT* pPT ); +#define _Out_ _Pre_cap_c_(1) _Pre_invalid_ +#define _Out_opt_ _Pre_opt_cap_c_(1) _Pre_invalid_ + +// 'out' with buffer size +// e.g. 
void GetIndeces( _Out_cap_(cIndeces) int* rgIndeces, size_t cIndices ); +// buffer capacity is described by another parameter +#define _Out_cap_(size) _Pre_cap_(size) _Pre_invalid_ +#define _Out_opt_cap_(size) _Pre_opt_cap_(size) _Pre_invalid_ +#define _Out_bytecap_(size) _Pre_bytecap_(size) _Pre_invalid_ +#define _Out_opt_bytecap_(size) _Pre_opt_bytecap_(size) _Pre_invalid_ + +// buffer capacity is described by a constant expression +#define _Out_cap_c_(size) _Pre_cap_c_(size) _Pre_invalid_ +#define _Out_opt_cap_c_(size) _Pre_opt_cap_c_(size) _Pre_invalid_ +#define _Out_bytecap_c_(size) _Pre_bytecap_c_(size) _Pre_invalid_ +#define _Out_opt_bytecap_c_(size) _Pre_opt_bytecap_c_(size) _Pre_invalid_ + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Out_cap_m_(mult,size) _Pre_cap_m_(mult,size) _Pre_invalid_ +#define _Out_opt_cap_m_(mult,size) _Pre_opt_cap_m_(mult,size) _Pre_invalid_ +#define _Out_z_cap_m_(mult,size) _Pre_cap_m_(mult,size) _Pre_invalid_ _Post_z_ +#define _Out_opt_z_cap_m_(mult,size) _Pre_opt_cap_m_(mult,size) _Pre_invalid_ _Post_z_ + +// buffer capacity is described by another pointer +// e.g. void Foo( _Out_ptrdiff_cap_(pchMax) char* pch, const char* pchMax ) { while pch < pchMax ) pch++; } +#define _Out_ptrdiff_cap_(size) _Pre_ptrdiff_cap_(size) _Pre_invalid_ +#define _Out_opt_ptrdiff_cap_(size) _Pre_opt_ptrdiff_cap_(size) _Pre_invalid_ + +// buffer capacity is described by a complex expression +#define _Out_cap_x_(size) _Pre_cap_x_(size) _Pre_invalid_ +#define _Out_opt_cap_x_(size) _Pre_opt_cap_x_(size) _Pre_invalid_ +#define _Out_bytecap_x_(size) _Pre_bytecap_x_(size) _Pre_invalid_ +#define _Out_opt_bytecap_x_(size) _Pre_opt_bytecap_x_(size) _Pre_invalid_ + +// a zero terminated string is filled into a buffer of given capacity +// e.g. 
void CopyStr( _In_z_ const char* szFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// buffer capacity is described by another parameter +#define _Out_z_cap_(size) _Pre_cap_(size) _Pre_invalid_ _Post_z_ +#define _Out_opt_z_cap_(size) _Pre_opt_cap_(size) _Pre_invalid_ _Post_z_ +#define _Out_z_bytecap_(size) _Pre_bytecap_(size) _Pre_invalid_ _Post_z_ +#define _Out_opt_z_bytecap_(size) _Pre_opt_bytecap_(size) _Pre_invalid_ _Post_z_ + +// buffer capacity is described by a constant expression +#define _Out_z_cap_c_(size) _Pre_cap_c_(size) _Pre_invalid_ _Post_z_ +#define _Out_opt_z_cap_c_(size) _Pre_opt_cap_c_(size) _Pre_invalid_ _Post_z_ +#define _Out_z_bytecap_c_(size) _Pre_bytecap_c_(size) _Pre_invalid_ _Post_z_ +#define _Out_opt_z_bytecap_c_(size) _Pre_opt_bytecap_c_(size) _Pre_invalid_ _Post_z_ + +// buffer capacity is described by a complex expression +#define _Out_z_cap_x_(size) _Pre_cap_x_(size) _Pre_invalid_ _Post_z_ +#define _Out_opt_z_cap_x_(size) _Pre_opt_cap_x_(size) _Pre_invalid_ _Post_z_ +#define _Out_z_bytecap_x_(size) _Pre_bytecap_x_(size) _Pre_invalid_ _Post_z_ +#define _Out_opt_z_bytecap_x_(size) _Pre_opt_bytecap_x_(size) _Pre_invalid_ _Post_z_ + +// a zero terminated string is filled into a buffer of given capacity +// e.g. size_t CopyCharRange( _In_count_(cchFrom) const char* rgchFrom, size_t cchFrom, _Out_cap_post_count_(cchTo,return)) char* rgchTo, size_t cchTo ); +#define _Out_cap_post_count_(cap,count) _Pre_cap_(cap) _Pre_invalid_ _Post_count_(count) +#define _Out_opt_cap_post_count_(cap,count) _Pre_opt_cap_(cap) _Pre_invalid_ _Post_count_(count) +#define _Out_bytecap_post_bytecount_(cap,count) _Pre_bytecap_(cap) _Pre_invalid_ _Post_bytecount_(count) +#define _Out_opt_bytecap_post_bytecount_(cap,count) _Pre_opt_bytecap_(cap) _Pre_invalid_ _Post_bytecount_(count) + +// a zero terminated string is filled into a buffer of given capacity +// e.g. 
size_t CopyStr( _In_z_ const char* szFrom, _Out_z_cap_post_count_(cchTo,return+1) char* szTo, size_t cchTo ); +#define _Out_z_cap_post_count_(cap,count) _Pre_cap_(cap) _Pre_invalid_ _Post_z_count_(count) +#define _Out_opt_z_cap_post_count_(cap,count) _Pre_opt_cap_(cap) _Pre_invalid_ _Post_z_count_(count) +#define _Out_z_bytecap_post_bytecount_(cap,count) _Pre_bytecap_(cap) _Pre_invalid_ _Post_z_bytecount_(count) +#define _Out_opt_z_bytecap_post_bytecount_(cap,count) _Pre_opt_bytecap_(cap) _Pre_invalid_ _Post_z_bytecount_(count) + +// only use with dereferenced arguments e.g. '*pcch' +#define _Out_capcount_(capcount) _Pre_cap_(capcount) _Pre_invalid_ _Post_count_(capcount) +#define _Out_opt_capcount_(capcount) _Pre_opt_cap_(capcount) _Pre_invalid_ _Post_count_(capcount) +#define _Out_bytecapcount_(capcount) _Pre_bytecap_(capcount) _Pre_invalid_ _Post_bytecount_(capcount) +#define _Out_opt_bytecapcount_(capcount) _Pre_opt_bytecap_(capcount) _Pre_invalid_ _Post_bytecount_(capcount) + +#define _Out_capcount_x_(capcount) _Pre_cap_x_(capcount) _Pre_invalid_ _Post_count_x_(capcount) +#define _Out_opt_capcount_x_(capcount) _Pre_opt_cap_x_(capcount) _Pre_invalid_ _Post_count_x_(capcount) +#define _Out_bytecapcount_x_(capcount) _Pre_bytecap_x_(capcount) _Pre_invalid_ _Post_bytecount_x_(capcount) +#define _Out_opt_bytecapcount_x_(capcount) _Pre_opt_bytecap_x_(capcount) _Pre_invalid_ _Post_bytecount_x_(capcount) + +// e.g. 
GetString( _Out_z_capcount_(*pLen+1) char* sz, size_t* pLen ); +#define _Out_z_capcount_(capcount) _Pre_cap_(capcount) _Pre_invalid_ _Post_z_count_(capcount) +#define _Out_opt_z_capcount_(capcount) _Pre_opt_cap_(capcount) _Pre_invalid_ _Post_z_count_(capcount) +#define _Out_z_bytecapcount_(capcount) _Pre_bytecap_(capcount) _Pre_invalid_ _Post_z_bytecount_(capcount) +#define _Out_opt_z_bytecapcount_(capcount) _Pre_opt_bytecap_(capcount) _Pre_invalid_ _Post_z_bytecount_(capcount) + +// inout parameters ---------------------------- + +// inout pointer parameter +// e.g. void ModifyPoint( _Inout_ POINT* pPT ); +#define _Inout_ _Prepost_valid_ +#define _Inout_opt_ _Prepost_opt_valid_ + +// string buffers +// e.g. void toupper( _Inout_z_ char* sz ); +#define _Inout_z_ _Prepost_z_ +#define _Inout_opt_z_ _Prepost_opt_z_ + +// 'inout' buffers with initialized elements before and after the call +// e.g. void ModifyIndices( _Inout_count_(cIndices) int* rgIndeces, size_t cIndices ); +#define _Inout_count_(size) _Prepost_count_(size) +#define _Inout_opt_count_(size) _Prepost_opt_count_(size) +#define _Inout_bytecount_(size) _Prepost_bytecount_(size) +#define _Inout_opt_bytecount_(size) _Prepost_opt_bytecount_(size) + +#define _Inout_count_c_(size) _Prepost_count_c_(size) +#define _Inout_opt_count_c_(size) _Prepost_opt_count_c_(size) +#define _Inout_bytecount_c_(size) _Prepost_bytecount_c_(size) +#define _Inout_opt_bytecount_c_(size) _Prepost_opt_bytecount_c_(size) + +// nullterminated 'inout' buffers with initialized elements before and after the call +// e.g. 
void ModifyIndices( _Inout_count_(cIndices) int* rgIndeces, size_t cIndices ); +#define _Inout_z_count_(size) _Prepost_z_ _Prepost_count_(size) +#define _Inout_opt_z_count_(size) _Prepost_z_ _Prepost_opt_count_(size) +#define _Inout_z_bytecount_(size) _Prepost_z_ _Prepost_bytecount_(size) +#define _Inout_opt_z_bytecount_(size) _Prepost_z_ _Prepost_opt_bytecount_(size) + +#define _Inout_z_count_c_(size) _Prepost_z_ _Prepost_count_c_(size) +#define _Inout_opt_z_count_c_(size) _Prepost_z_ _Prepost_opt_count_c_(size) +#define _Inout_z_bytecount_c_(size) _Prepost_z_ _Prepost_bytecount_c_(size) +#define _Inout_opt_z_bytecount_c_(size) _Prepost_z_ _Prepost_opt_bytecount_c_(size) + +#define _Inout_ptrdiff_count_(size) _Pre_ptrdiff_count_(size) +#define _Inout_opt_ptrdiff_count_(size) _Pre_opt_ptrdiff_count_(size) + +#define _Inout_count_x_(size) _Prepost_count_x_(size) +#define _Inout_opt_count_x_(size) _Prepost_opt_count_x_(size) +#define _Inout_bytecount_x_(size) _Prepost_bytecount_x_(size) +#define _Inout_opt_bytecount_x_(size) _Prepost_opt_bytecount_x_(size) + +// e.g. 
void AppendToLPSTR( _In_ LPCSTR szFrom, _Inout_cap_(cchTo) LPSTR* szTo, size_t cchTo ); +#define _Inout_cap_(size) _Pre_valid_cap_(size) _Post_valid_ +#define _Inout_opt_cap_(size) _Pre_opt_valid_cap_(size) _Post_valid_ +#define _Inout_bytecap_(size) _Pre_valid_bytecap_(size) _Post_valid_ +#define _Inout_opt_bytecap_(size) _Pre_opt_valid_bytecap_(size) _Post_valid_ + +#define _Inout_cap_c_(size) _Pre_valid_cap_c_(size) _Post_valid_ +#define _Inout_opt_cap_c_(size) _Pre_opt_valid_cap_c_(size) _Post_valid_ +#define _Inout_bytecap_c_(size) _Pre_valid_bytecap_c_(size) _Post_valid_ +#define _Inout_opt_bytecap_c_(size) _Pre_opt_valid_bytecap_c_(size) _Post_valid_ + +#define _Inout_cap_x_(size) _Pre_valid_cap_x_(size) _Post_valid_ +#define _Inout_opt_cap_x_(size) _Pre_opt_valid_cap_x_(size) _Post_valid_ +#define _Inout_bytecap_x_(size) _Pre_valid_bytecap_x_(size) _Post_valid_ +#define _Inout_opt_bytecap_x_(size) _Pre_opt_valid_bytecap_x_(size) _Post_valid_ + +// inout string buffers with writable size +// e.g. 
void AppendStr( _In_z_ const char* szFrom, _Inout_z_cap_(cchTo) char* szTo, size_t cchTo ); +#define _Inout_z_cap_(size) _Pre_z_cap_(size) _Post_z_ +#define _Inout_opt_z_cap_(size) _Pre_opt_z_cap_(size) _Post_z_ +#define _Inout_z_bytecap_(size) _Pre_z_bytecap_(size) _Post_z_ +#define _Inout_opt_z_bytecap_(size) _Pre_opt_z_bytecap_(size) _Post_z_ + +#define _Inout_z_cap_c_(size) _Pre_z_cap_c_(size) _Post_z_ +#define _Inout_opt_z_cap_c_(size) _Pre_opt_z_cap_c_(size) _Post_z_ +#define _Inout_z_bytecap_c_(size) _Pre_z_bytecap_c_(size) _Post_z_ +#define _Inout_opt_z_bytecap_c_(size) _Pre_opt_z_bytecap_c_(size) _Post_z_ + +#define _Inout_z_cap_x_(size) _Pre_z_cap_x_(size) _Post_z_ +#define _Inout_opt_z_cap_x_(size) _Pre_opt_z_cap_x_(size) _Post_z_ +#define _Inout_z_bytecap_x_(size) _Pre_z_bytecap_x_(size) _Post_z_ +#define _Inout_opt_z_bytecap_x_(size) _Pre_opt_z_bytecap_x_(size) _Post_z_ + +// return values ------------------------------- + +// returning pointers to valid objects +#define _Ret_ _Ret_valid_ +#define _Ret_opt_ _Ret_opt_valid_ + +// More _Ret_ annotations are defined below + +// Pointer to pointers ------------------------- + +// e.g. HRESULT HrCreatePoint( _Deref_out_opt_ POINT** ppPT ); +#define _Deref_out_ _Out_ _Deref_pre_invalid_ _Deref_post_valid_ +#define _Deref_out_opt_ _Out_ _Deref_pre_invalid_ _Deref_post_opt_valid_ +#define _Deref_opt_out_ _Out_opt_ _Deref_pre_invalid_ _Deref_post_valid_ +#define _Deref_opt_out_opt_ _Out_opt_ _Deref_pre_invalid_ _Deref_post_opt_valid_ + +// e.g. 
void CloneString( _In_z_ const wchar_t* wzFrom, _Deref_out_z_ wchar_t** pWzTo ); +#define _Deref_out_z_ _Out_ _Deref_pre_invalid_ _Deref_post_z_ +#define _Deref_out_opt_z_ _Out_ _Deref_pre_invalid_ _Deref_post_opt_z_ +#define _Deref_opt_out_z_ _Out_opt_ _Deref_pre_invalid_ _Deref_post_z_ +#define _Deref_opt_out_opt_z_ _Out_opt_ _Deref_pre_invalid_ _Deref_post_opt_z_ + +// More _Deref_ annotations are defined below + +// Other annotations + +// Check the return value of a function e.g. _Check_return_ ErrorCode Foo(); +#define _Check_return_ _Check_return_impl_ + +// e.g. MyPrintF( _Printf_format_string_ const wchar_t* wzFormat, ... ); +#define _Printf_format_string_ _Printf_format_string_impl_ +#define _Scanf_format_string_ _Scanf_format_string_impl_ +#define _Scanf_s_format_string_ _Scanf_s_format_string_impl_ +#define _FormatMessage_format_string_ + +// <expr> indicates whether post conditions apply +#define _Success_(expr) _Success_impl_(expr) + +// annotations to express 'boundedness' of integral value parameter +#define _In_bound_ _In_bound_impl_ +#define _Out_bound_ _Out_bound_impl_ +#define _Ret_bound_ _Ret_bound_impl_ +#define _Deref_in_bound_ _Deref_in_bound_impl_ +#define _Deref_out_bound_ _Deref_out_bound_impl_ +#define _Deref_inout_bound_ _Deref_in_bound_ _Deref_out_bound_ +#define _Deref_ret_bound_ _Deref_ret_bound_impl_ + +// annotations to express upper and lower bounds of integral value parameter +#define _In_range_(lb,ub) _In_range_impl_(lb,ub) +#define _Out_range_(lb,ub) _Out_range_impl_(lb,ub) +#define _Ret_range_(lb,ub) _Ret_range_impl_(lb,ub) +#define _Deref_in_range_(lb,ub) _Deref_in_range_impl_(lb,ub) +#define _Deref_out_range_(lb,ub) _Deref_out_range_impl_(lb,ub) +#define _Deref_ret_range_(lb,ub) _Deref_ret_range_impl_(lb,ub) + +//============================================================================ +// _Pre_\_Post_ Layer: +//============================================================================ + +// +// _Pre_ annotation --- +// 
+// describing conditions that must be met before the call of the function + +// e.g. int strlen( _Pre_z_ const char* sz ); +// buffer is a zero terminated string +#define _Pre_z_ _Pre2_impl_(_$notnull, _$zterm) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_z_ _Pre2_impl_(_$maybenull,_$zterm) _Deref_pre1_impl_(_$valid) + +// e.g. void FreeMemory( _Pre_bytecap_(cb) _Post_ptr_invalid_ void* pv, size_t cb ); +// buffer capacity described by another parameter +#define _Pre_cap_(size) _Pre2_impl_(_$notnull, _$cap(size)) +#define _Pre_opt_cap_(size) _Pre2_impl_(_$maybenull,_$cap(size)) +#define _Pre_bytecap_(size) _Pre2_impl_(_$notnull, _$bytecap(size)) +#define _Pre_opt_bytecap_(size) _Pre2_impl_(_$maybenull,_$bytecap(size)) + +// buffer capacity described by a constant expression +#define _Pre_cap_c_(size) _Pre2_impl_(_$notnull, _$cap_c(size)) +#define _Pre_opt_cap_c_(size) _Pre2_impl_(_$maybenull,_$cap_c(size)) +#define _Pre_bytecap_c_(size) _Pre2_impl_(_$notnull, _$bytecap_c(size)) +#define _Pre_opt_bytecap_c_(size) _Pre2_impl_(_$maybenull,_$bytecap_c(size)) + +// buffer capacity is described by another parameter multiplied by a constant expression +#define _Pre_cap_m_(mult,size) _Pre2_impl_(_$notnull, _$mult(mult,size)) +#define _Pre_opt_cap_m_(mult,size) _Pre2_impl_(_$maybenull,_$mult(mult,size)) + +// buffer capacity described by size of other buffer, only used by dangerous legacy APIs +// e.g. 
int strcpy(_Pre_cap_for_(src) char* dst, const char* src); +#define _Pre_cap_for_(param) _Pre2_impl_(_$notnull, _$cap_for(param)) +#define _Pre_opt_cap_for_(param) _Pre2_impl_(_$maybenull,_$cap_for(param)) + +// buffer capacity described by a complex condition +#define _Pre_cap_x_(size) _Pre2_impl_(_$notnull, _$cap_x(size)) +#define _Pre_opt_cap_x_(size) _Pre2_impl_(_$maybenull,_$cap_x(size)) +#define _Pre_bytecap_x_(size) _Pre2_impl_(_$notnull, _$bytecap_x(size)) +#define _Pre_opt_bytecap_x_(size) _Pre2_impl_(_$maybenull,_$bytecap_x(size)) + +// buffer capacity described by the difference to another pointer parameter +#define _Pre_ptrdiff_cap_(ptr) _Pre2_impl_(_$notnull, _$cap_x(__ptrdiff(ptr))) +#define _Pre_opt_ptrdiff_cap_(ptr) _Pre2_impl_(_$maybenull,_$cap_x(__ptrdiff(ptr))) + +// e.g. void AppendStr( _Pre_z_ const char* szFrom, _Pre_z_cap_(cchTo) _Post_z_ char* szTo, size_t cchTo ); +#define _Pre_z_cap_(size) _Pre3_impl_(_$notnull, _$zterm,_$cap(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_z_cap_(size) _Pre3_impl_(_$maybenull,_$zterm,_$cap(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_z_bytecap_(size) _Pre3_impl_(_$notnull, _$zterm,_$bytecap(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_z_bytecap_(size) _Pre3_impl_(_$maybenull,_$zterm,_$bytecap(size)) _Deref_pre1_impl_(_$valid) + +#define _Pre_z_cap_c_(size) _Pre3_impl_(_$notnull, _$zterm,_$cap_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_z_cap_c_(size) _Pre3_impl_(_$maybenull,_$zterm,_$cap_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_z_bytecap_c_(size) _Pre3_impl_(_$notnull, _$zterm,_$bytecap_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_z_bytecap_c_(size) _Pre3_impl_(_$maybenull,_$zterm,_$bytecap_c(size)) _Deref_pre1_impl_(_$valid) + +#define _Pre_z_cap_x_(size) _Pre3_impl_(_$notnull, _$zterm,_$cap_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_z_cap_x_(size) _Pre3_impl_(_$maybenull,_$zterm,_$cap_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_z_bytecap_x_(size) 
_Pre3_impl_(_$notnull, _$zterm,_$bytecap_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_z_bytecap_x_(size) _Pre3_impl_(_$maybenull,_$zterm,_$bytecap_x(size)) _Deref_pre1_impl_(_$valid) + +// known capacity and valid but unknown readable extent +#define _Pre_valid_cap_(size) _Pre2_impl_(_$notnull, _$cap(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_valid_cap_(size) _Pre2_impl_(_$maybenull,_$cap(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_valid_bytecap_(size) _Pre2_impl_(_$notnull, _$bytecap(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_valid_bytecap_(size) _Pre2_impl_(_$maybenull,_$bytecap(size)) _Deref_pre1_impl_(_$valid) + +#define _Pre_valid_cap_c_(size) _Pre2_impl_(_$notnull, _$cap_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_valid_cap_c_(size) _Pre2_impl_(_$maybenull,_$cap_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_valid_bytecap_c_(size) _Pre2_impl_(_$notnull, _$bytecap_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_valid_bytecap_c_(size) _Pre2_impl_(_$maybenull,_$bytecap_c(size)) _Deref_pre1_impl_(_$valid) + +#define _Pre_valid_cap_x_(size) _Pre2_impl_(_$notnull, _$cap_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_valid_cap_x_(size) _Pre2_impl_(_$maybenull,_$cap_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_valid_bytecap_x_(size) _Pre2_impl_(_$notnull, _$bytecap_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_valid_bytecap_x_(size) _Pre2_impl_(_$maybenull,_$bytecap_x(size)) _Deref_pre1_impl_(_$valid) + +// e.g. 
void AppendCharRange( _Pre_count_(cchFrom) const char* rgFrom, size_t cchFrom, _Out_z_cap_(cchTo) char* szTo, size_t cchTo ); +// Valid buffer extent described by another parameter +#define _Pre_count_(size) _Pre2_impl_(_$notnull, _$count(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_count_(size) _Pre2_impl_(_$maybenull,_$count(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_bytecount_(size) _Pre2_impl_(_$notnull, _$bytecount(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_bytecount_(size) _Pre2_impl_(_$maybenull,_$bytecount(size)) _Deref_pre1_impl_(_$valid) + +// Valid buffer extent described by a constant expression +#define _Pre_count_c_(size) _Pre2_impl_(_$notnull, _$count_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_count_c_(size) _Pre2_impl_(_$maybenull,_$count_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_bytecount_c_(size) _Pre2_impl_(_$notnull, _$bytecount_c(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_bytecount_c_(size) _Pre2_impl_(_$maybenull,_$bytecount_c(size)) _Deref_pre1_impl_(_$valid) + +// Valid buffer extent described by a complex expression +#define _Pre_count_x_(size) _Pre2_impl_(_$notnull, _$count_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_count_x_(size) _Pre2_impl_(_$maybenull,_$count_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_bytecount_x_(size) _Pre2_impl_(_$notnull, _$bytecount_x(size)) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_bytecount_x_(size) _Pre2_impl_(_$maybenull,_$bytecount_x(size)) _Deref_pre1_impl_(_$valid) + +// Valid buffer extent described by the difference to another pointer parameter +#define _Pre_ptrdiff_count_(ptr) _Pre2_impl_(_$notnull, _$count_x(__ptrdiff(ptr))) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_ptrdiff_count_(ptr) _Pre2_impl_(_$maybenull,_$count_x(__ptrdiff(ptr))) _Deref_pre1_impl_(_$valid) + +// valid size unknown or indicated by type (e.g.:LPSTR) +#define _Pre_valid_ _Pre1_impl_(_$notnull) _Deref_pre1_impl_(_$valid) +#define _Pre_opt_valid_ 
_Pre1_impl_(_$maybenull) _Deref_pre1_impl_(_$valid) + +#define _Pre_invalid_ _Deref_pre1_impl_(_$notvalid) + +// used with allocated but not yet initialized objects +#define _Pre_notnull_ _Pre1_impl_(_$notnull) +#define _Pre_maybenull_ _Pre1_impl_(_$maybenull) +#define _Pre_null_ _Pre1_impl_(_$null) + +// restrict access rights +#define _Pre_readonly_ _Pre1_impl_(_$readaccess) +#define _Pre_writeonly_ _Pre1_impl_(_$writeaccess) +// +// _Post_ annotations --- +// +// describing conditions that hold after the function call + +// void CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_ char* szFrom, size_t cchFrom ); +// buffer will be a zero-terminated string after the call +#define _Post_z_ _Post1_impl_(_$zterm) _Deref_post1_impl_(_$valid) + +// char * strncpy(_Out_cap_(_Count) _Post_maybez_ char * _Dest, _In_z_ const char * _Source, _In_ size_t _Count) +// buffer maybe zero-terminated after the call +#define _Post_maybez_ _Post1_impl_(_$maybezterm) + +// e.g. SIZE_T HeapSize( _In_ HANDLE hHeap, DWORD dwFlags, _Pre_notnull_ _Post_bytecap_(return) LPCVOID lpMem ); +#define _Post_cap_(size) _Post1_impl_(_$cap(size)) +#define _Post_bytecap_(size) _Post1_impl_(_$bytecap(size)) + +// e.g. int strlen( _In_z_ _Post_count_(return+1) const char* sz ); +#define _Post_count_(size) _Post1_impl_(_$count(size)) _Deref_post1_impl_(_$valid) +#define _Post_bytecount_(size) _Post1_impl_(_$bytecount(size)) _Deref_post1_impl_(_$valid) +#define _Post_count_c_(size) _Post1_impl_(_$count_c(size)) _Deref_post1_impl_(_$valid) +#define _Post_bytecount_c_(size) _Post1_impl_(_$bytecount_c(size)) _Deref_post1_impl_(_$valid) +#define _Post_count_x_(size) _Post1_impl_(_$count_x(size)) _Deref_post1_impl_(_$valid) +#define _Post_bytecount_x_(size) _Post1_impl_(_$bytecount_x(size)) _Deref_post1_impl_(_$valid) + +// e.g. 
size_t CopyStr( _In_z_ const char* szFrom, _Pre_cap_(cch) _Post_z_count_(return+1) char* szFrom, size_t cchFrom ); +#define _Post_z_count_(size) _Post2_impl_(_$zterm,_$count(size)) _Deref_post1_impl_(_$valid) +#define _Post_z_bytecount_(size) _Post2_impl_(_$zterm,_$bytecount(size)) _Deref_post1_impl_(_$valid) +#define _Post_z_count_c_(size) _Post2_impl_(_$zterm,_$count_c(size)) _Deref_post1_impl_(_$valid) +#define _Post_z_bytecount_c_(size) _Post2_impl_(_$zterm,_$bytecount_c(size)) _Deref_post1_impl_(_$valid) +#define _Post_z_count_x_(size) _Post2_impl_(_$zterm,_$count_x(size)) _Deref_post1_impl_(_$valid) +#define _Post_z_bytecount_x_(size) _Post2_impl_(_$zterm,_$bytecount_x(size)) _Deref_post1_impl_(_$valid) + +// e.g. void free( _Post_ptr_invalid_ void* pv ); +#define _Post_ptr_invalid_ _Post1_impl_(_$notvalid) + +// e.g. HRESULT InitStruct( _Post_valid_ Struct* pobj ); +#define _Post_valid_ _Deref_post1_impl_(_$valid) +#define _Post_invalid_ _Deref_post1_impl_(_$notvalid) + +// e.g. void ThrowExceptionIfNull( _Post_notnull_ const void* pv ); +#define _Post_notnull_ _Post1_impl_(_$notnull) + +// +// _Ret_ annotations +// +// describing conditions that hold for return values after the call + +// e.g. _Ret_z_ CString::operator const wchar_t*() const throw(); +#define _Ret_z_ _Ret2_impl_(_$notnull, _$zterm) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_z_ _Ret2_impl_(_$maybenull,_$zterm) _Deref_ret1_impl_(_$valid) + +// e.g. 
_Ret_opt_bytecap_(cb) void* AllocateMemory( size_t cb ); +// Buffer capacity is described by another parameter +#define _Ret_cap_(size) _Ret2_impl_(_$notnull, _$cap(size)) +#define _Ret_opt_cap_(size) _Ret2_impl_(_$maybenull,_$cap(size)) +#define _Ret_bytecap_(size) _Ret2_impl_(_$notnull, _$bytecap(size)) +#define _Ret_opt_bytecap_(size) _Ret2_impl_(_$maybenull,_$bytecap(size)) + +// Buffer capacity is described by a constant expression +#define _Ret_cap_c_(size) _Ret2_impl_(_$notnull, _$cap_c(size)) +#define _Ret_opt_cap_c_(size) _Ret2_impl_(_$maybenull,_$cap_c(size)) +#define _Ret_bytecap_c_(size) _Ret2_impl_(_$notnull, _$bytecap_c(size)) +#define _Ret_opt_bytecap_c_(size) _Ret2_impl_(_$maybenull,_$bytecap_c(size)) + +// Buffer capacity is described by a complex condition +#define _Ret_cap_x_(size) _Ret2_impl_(_$notnull, _$cap_x(size)) +#define _Ret_opt_cap_x_(size) _Ret2_impl_(_$maybenull,_$cap_x(size)) +#define _Ret_bytecap_x_(size) _Ret2_impl_(_$notnull, _$bytecap_x(size)) +#define _Ret_opt_bytecap_x_(size) _Ret2_impl_(_$maybenull,_$bytecap_x(size)) + +// return value is nullterminated and capacity is given by another parameter +#define _Ret_z_cap_(size) _Ret3_impl_(_$notnull, _$zterm,_$cap(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_z_cap_(size) _Ret3_impl_(_$maybenull,_$zterm,_$cap(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_z_bytecap_(size) _Ret3_impl_(_$notnull, _$zterm,_$bytecap(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_z_bytecap_(size) _Ret3_impl_(_$maybenull,_$zterm,_$bytecap(size)) _Deref_ret1_impl_(_$valid) + +// e.g. 
_Ret_opt_bytecount_(cb) void* AllocateZeroInitializedMemory( size_t cb ); +// Valid Buffer extent is described by another parameter +#define _Ret_count_(size) _Ret2_impl_(_$notnull, _$count(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_count_(size) _Ret2_impl_(_$maybenull,_$count(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_bytecount_(size) _Ret2_impl_(_$notnull, _$bytecount(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_bytecount_(size) _Ret2_impl_(_$maybenull,_$bytecount(size)) _Deref_ret1_impl_(_$valid) + +// Valid Buffer extent is described by a constant expression +#define _Ret_count_c_(size) _Ret2_impl_(_$notnull, _$count_c(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_count_c_(size) _Ret2_impl_(_$maybenull,_$count_c(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_bytecount_c_(size) _Ret2_impl_(_$notnull, _$bytecount_c(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_bytecount_c_(size) _Ret2_impl_(_$maybenull,_$bytecount_c(size)) _Deref_ret1_impl_(_$valid) + +// Valid Buffer extent is described by a complex expression +#define _Ret_count_x_(size) _Ret2_impl_(_$notnull, _$count_x(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_count_x_(size) _Ret2_impl_(_$maybenull,_$count_x(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_bytecount_x_(size) _Ret2_impl_(_$notnull, _$bytecount_x(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_bytecount_x_(size) _Ret2_impl_(_$maybenull,_$bytecount_x(size)) _Deref_ret1_impl_(_$valid) + +// return value is nullterminated and length is given by another parameter +#define _Ret_z_count_(size) _Ret3_impl_(_$notnull, _$zterm,_$count(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_z_count_(size) _Ret3_impl_(_$maybenull,_$zterm,_$count(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_z_bytecount_(size) _Ret3_impl_(_$notnull, _$zterm,_$bytecount(size)) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_z_bytecount_(size) _Ret3_impl_(_$maybenull,_$zterm,_$bytecount(size)) _Deref_ret1_impl_(_$valid) + +// e.g. 
_Ret_opt_valid_ LPSTR void* CloneSTR( _Pre_valid_ LPSTR src ); +#define _Ret_valid_ _Ret1_impl_(_$notnull) _Deref_ret1_impl_(_$valid) +#define _Ret_opt_valid_ _Ret1_impl_(_$maybenull) _Deref_ret1_impl_(_$valid) + +// used with allocated but not yet initialized objects +#define _Ret_notnull_ _Ret1_impl_(_$notnull) +#define _Ret_maybenull_ _Ret1_impl_(_$maybenull) +#define _Ret_null_ _Ret1_impl_(_$null) + +// +// _Deref_pre_ --- +// +// describing conditions for array elements of dereferenced pointer parameters that must be met before the call + +// e.g. void SaveStringArray( _In_count_(cStrings) _Deref_pre_z_ const wchar_t* const rgpwch[] ); +#define _Deref_pre_z_ _Deref_pre2_impl_(_$notnull, _$zterm) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_z_ _Deref_pre2_impl_(_$maybenull,_$zterm) _Deref2_pre1_impl_(_$valid) + +// e.g. void FillInArrayOfStr32( _In_count_(cStrings) _Deref_pre_cap_c_(32) _Deref_post_z_ wchar_t* const rgpwch[] ); +// buffer capacity is described by another parameter +#define _Deref_pre_cap_(size) _Deref_pre2_impl_(_$notnull, _$cap(size)) +#define _Deref_pre_opt_cap_(size) _Deref_pre2_impl_(_$maybenull,_$cap(size)) +#define _Deref_pre_bytecap_(size) _Deref_pre2_impl_(_$notnull, _$bytecap(size)) +#define _Deref_pre_opt_bytecap_(size) _Deref_pre2_impl_(_$maybenull,_$bytecap(size)) + +// buffer capacity is described by a constant expression +#define _Deref_pre_cap_c_(size) _Deref_pre2_impl_(_$notnull, _$cap_c(size)) +#define _Deref_pre_opt_cap_c_(size) _Deref_pre2_impl_(_$maybenull,_$cap_c(size)) +#define _Deref_pre_bytecap_c_(size) _Deref_pre2_impl_(_$notnull, _$bytecap_c(size)) +#define _Deref_pre_opt_bytecap_c_(size) _Deref_pre2_impl_(_$maybenull,_$bytecap_c(size)) + +// buffer capacity is described by a complex condition +#define _Deref_pre_cap_x_(size) _Deref_pre2_impl_(_$notnull, _$cap_x(size)) +#define _Deref_pre_opt_cap_x_(size) _Deref_pre2_impl_(_$maybenull,_$cap_x(size)) +#define _Deref_pre_bytecap_x_(size) 
_Deref_pre2_impl_(_$notnull, _$bytecap_x(size)) +#define _Deref_pre_opt_bytecap_x_(size) _Deref_pre2_impl_(_$maybenull,_$bytecap_x(size)) + +// convenience macros for nullterminated buffers with given capacity +#define _Deref_pre_z_cap_(size) _Deref_pre3_impl_(_$notnull, _$zterm,_$cap(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_z_cap_(size) _Deref_pre3_impl_(_$maybenull,_$zterm,_$cap(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_z_bytecap_(size) _Deref_pre3_impl_(_$notnull, _$zterm,_$bytecap(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_z_bytecap_(size) _Deref_pre3_impl_(_$maybenull,_$zterm,_$bytecap(size)) _Deref2_pre1_impl_(_$valid) + +#define _Deref_pre_z_cap_c_(size) _Deref_pre3_impl_(_$notnull, _$zterm,_$cap_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_z_cap_c_(size) _Deref_pre3_impl_(_$maybenull,_$zterm,_$cap_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_z_bytecap_c_(size) _Deref_pre3_impl_(_$notnull, _$zterm,_$bytecap_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_z_bytecap_c_(size) _Deref_pre3_impl_(_$maybenull,_$zterm,_$bytecap_c(size)) _Deref2_pre1_impl_(_$valid) + +#define _Deref_pre_z_cap_x_(size) _Deref_pre3_impl_(_$notnull, _$zterm,_$cap_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_z_cap_x_(size) _Deref_pre3_impl_(_$maybenull,_$zterm,_$cap_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_z_bytecap_x_(size) _Deref_pre3_impl_(_$notnull, _$zterm,_$bytecap_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_z_bytecap_x_(size) _Deref_pre3_impl_(_$maybenull,_$zterm,_$bytecap_x(size)) _Deref2_pre1_impl_(_$valid) + +// known capacity and valid but unknown readable extent +#define _Deref_pre_valid_cap_(size) _Deref_pre2_impl_(_$notnull, _$cap(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_valid_cap_(size) _Deref_pre2_impl_(_$maybenull,_$cap(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_valid_bytecap_(size) 
_Deref_pre2_impl_(_$notnull, _$bytecap(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_valid_bytecap_(size) _Deref_pre2_impl_(_$maybenull,_$bytecap(size)) _Deref2_pre1_impl_(_$valid) + +#define _Deref_pre_valid_cap_c_(size) _Deref_pre2_impl_(_$notnull, _$cap_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_valid_cap_c_(size) _Deref_pre2_impl_(_$maybenull,_$cap_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_valid_bytecap_c_(size) _Deref_pre2_impl_(_$notnull, _$bytecap_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_valid_bytecap_c_(size) _Deref_pre2_impl_(_$maybenull,_$bytecap_c(size)) _Deref2_pre1_impl_(_$valid) + +#define _Deref_pre_valid_cap_x_(size) _Deref_pre2_impl_(_$notnull, _$cap_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_valid_cap_x_(size) _Deref_pre2_impl_(_$maybenull,_$cap_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_valid_bytecap_x_(size) _Deref_pre2_impl_(_$notnull, _$bytecap_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_valid_bytecap_x_(size) _Deref_pre2_impl_(_$maybenull,_$bytecap_x(size)) _Deref2_pre1_impl_(_$valid) + +// e.g. 
void SaveMatrix( _In_count_(n) _Deref_pre_count_(n) const Elem** matrix, size_t n ); +// valid buffer extent is described by another parameter +#define _Deref_pre_count_(size) _Deref_pre2_impl_(_$notnull, _$count(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_count_(size) _Deref_pre2_impl_(_$maybenull,_$count(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_bytecount_(size) _Deref_pre2_impl_(_$notnull, _$bytecount(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_bytecount_(size) _Deref_pre2_impl_(_$maybenull,_$bytecount(size)) _Deref2_pre1_impl_(_$valid) + +// valid buffer extent is described by a constant expression +#define _Deref_pre_count_c_(size) _Deref_pre2_impl_(_$notnull, _$count_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_count_c_(size) _Deref_pre2_impl_(_$maybenull,_$count_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_bytecount_c_(size) _Deref_pre2_impl_(_$notnull, _$bytecount_c(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_bytecount_c_(size) _Deref_pre2_impl_(_$maybenull,_$bytecount_c(size)) _Deref2_pre1_impl_(_$valid) + +// valid buffer extent is described by a complex expression +#define _Deref_pre_count_x_(size) _Deref_pre2_impl_(_$notnull, _$count_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_count_x_(size) _Deref_pre2_impl_(_$maybenull,_$count_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_bytecount_x_(size) _Deref_pre2_impl_(_$notnull, _$bytecount_x(size)) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_bytecount_x_(size) _Deref_pre2_impl_(_$maybenull,_$bytecount_x(size)) _Deref2_pre1_impl_(_$valid) + +// e.g. 
void PrintStringArray( _In_count_(cElems) _Deref_pre_valid_ LPCSTR rgStr[], size_t cElems ); +#define _Deref_pre_valid_ _Deref_pre1_impl_(_$notnull) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_opt_valid_ _Deref_pre1_impl_(_$maybenull) _Deref2_pre1_impl_(_$valid) +#define _Deref_pre_invalid_ _Deref2_pre1_impl_(_$notvalid) + +#define _Deref_pre_notnull_ _Deref_pre1_impl_(_$notnull) +#define _Deref_pre_maybenull_ _Deref_pre1_impl_(_$maybenull) +#define _Deref_pre_null_ _Deref_pre1_impl_(_$null) + +// restrict access rights +#define _Deref_pre_readonly_ _Deref_pre1_impl_(_$readaccess) +#define _Deref_pre_writeonly_ _Deref_pre1_impl_(_$writeaccess) + +// +// _Deref_post_ --- +// +// describing conditions for array elements or dereferenced pointer parameters that hold after the call + +// e.g. void CloneString( _In_z_ const Wchar_t* wzIn _Out_ _Deref_post_z_ wchar_t** pWzOut ); +#define _Deref_post_z_ _Deref_post2_impl_(_$notnull, _$zterm) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_z_ _Deref_post2_impl_(_$maybenull,_$zterm) _Deref2_post1_impl_(_$valid) + +// e.g. 
// e.g. HRESULT HrAllocateMemory( size_t cb, _Out_ _Deref_post_bytecap_(cb) void** ppv );
// _Deref_post_ annotations describe the pointed-to buffer (*p) after the call.
// Buffer capacity is described by another parameter.
#define _Deref_post_cap_(size)             _Deref_post2_impl_(_$notnull, _$cap(size))
#define _Deref_post_opt_cap_(size)         _Deref_post2_impl_(_$maybenull,_$cap(size))
#define _Deref_post_bytecap_(size)         _Deref_post2_impl_(_$notnull, _$bytecap(size))
#define _Deref_post_opt_bytecap_(size)     _Deref_post2_impl_(_$maybenull,_$bytecap(size))

// Buffer capacity is described by a constant expression.
// NOTE(review): the vendored copy expanded these to _$cap_z/_$bytecap_z, helpers that
// are defined in none of the implementation branches below; every branch only defines
// _$cap_c/_$bytecap_c (the spelling the sibling _Deref_pre_*_c_ macros use), so the
// _c helpers are referenced here instead.
#define _Deref_post_cap_c_(size)           _Deref_post2_impl_(_$notnull, _$cap_c(size))
#define _Deref_post_opt_cap_c_(size)       _Deref_post2_impl_(_$maybenull,_$cap_c(size))
#define _Deref_post_bytecap_c_(size)       _Deref_post2_impl_(_$notnull, _$bytecap_c(size))
#define _Deref_post_opt_bytecap_c_(size)   _Deref_post2_impl_(_$maybenull,_$bytecap_c(size))

// Buffer capacity is described by a complex expression.
#define _Deref_post_cap_x_(size)           _Deref_post2_impl_(_$notnull, _$cap_x(size))
#define _Deref_post_opt_cap_x_(size)       _Deref_post2_impl_(_$maybenull,_$cap_x(size))
#define _Deref_post_bytecap_x_(size)       _Deref_post2_impl_(_$notnull, _$bytecap_x(size))
#define _Deref_post_opt_bytecap_x_(size)   _Deref_post2_impl_(_$maybenull,_$bytecap_x(size))

// Convenience macros for null-terminated buffers with a given capacity.
#define _Deref_post_z_cap_(size)           _Deref_post3_impl_(_$notnull, _$zterm,_$cap(size))     _Deref2_post1_impl_(_$valid)
#define _Deref_post_opt_z_cap_(size)       _Deref_post3_impl_(_$maybenull,_$zterm,_$cap(size))    _Deref2_post1_impl_(_$valid)
#define _Deref_post_z_bytecap_(size)       _Deref_post3_impl_(_$notnull, _$zterm,_$bytecap(size)) _Deref2_post1_impl_(_$valid)
#define _Deref_post_opt_z_bytecap_(size)   _Deref_post3_impl_(_$maybenull,_$zterm,_$bytecap(size)) _Deref2_post1_impl_(_$valid)

#define _Deref_post_z_cap_c_(size)         _Deref_post3_impl_(_$notnull, _$zterm,_$cap_c(size))   _Deref2_post1_impl_(_$valid)
#define _Deref_post_opt_z_cap_c_(size)     _Deref_post3_impl_(_$maybenull,_$zterm,_$cap_c(size))  _Deref2_post1_impl_(_$valid)
_Deref_post3_impl_(_$maybenull,_$zterm,_$cap_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_z_bytecap_c_(size) _Deref_post3_impl_(_$notnull, _$zterm,_$bytecap_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_z_bytecap_c_(size) _Deref_post3_impl_(_$maybenull,_$zterm,_$bytecap_c(size)) _Deref2_post1_impl_(_$valid) + +#define _Deref_post_z_cap_x_(size) _Deref_post3_impl_(_$notnull, _$zterm,_$cap_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_z_cap_x_(size) _Deref_post3_impl_(_$maybenull,_$zterm,_$cap_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_z_bytecap_x_(size) _Deref_post3_impl_(_$notnull, _$zterm,_$bytecap_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_z_bytecap_x_(size) _Deref_post3_impl_(_$maybenull,_$zterm,_$bytecap_x(size)) _Deref2_post1_impl_(_$valid) + +// known capacity and valid but unknown readable extent +#define _Deref_post_valid_cap_(size) _Deref_post2_impl_(_$notnull, _$cap(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_valid_cap_(size) _Deref_post2_impl_(_$maybenull,_$cap(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_valid_bytecap_(size) _Deref_post2_impl_(_$notnull, _$bytecap(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_valid_bytecap_(size) _Deref_post2_impl_(_$maybenull,_$bytecap(size)) _Deref2_post1_impl_(_$valid) + +#define _Deref_post_valid_cap_c_(size) _Deref_post2_impl_(_$notnull, _$cap_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_valid_cap_c_(size) _Deref_post2_impl_(_$maybenull,_$cap_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_valid_bytecap_c_(size) _Deref_post2_impl_(_$notnull, _$bytecap_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_valid_bytecap_c_(size) _Deref_post2_impl_(_$maybenull,_$bytecap_c(size)) _Deref2_post1_impl_(_$valid) + +#define _Deref_post_valid_cap_x_(size) _Deref_post2_impl_(_$notnull, _$cap_x(size)) _Deref2_post1_impl_(_$valid) +#define 
_Deref_post_opt_valid_cap_x_(size) _Deref_post2_impl_(_$maybenull,_$cap_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_valid_bytecap_x_(size) _Deref_post2_impl_(_$notnull, _$bytecap_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_valid_bytecap_x_(size) _Deref_post2_impl_(_$maybenull,_$bytecap_x(size)) _Deref2_post1_impl_(_$valid) + +// e.g. HRESULT HrAllocateZeroInitializedMemory( size_t cb, _Out_ _Deref_post_bytecount_(cb) void** ppv ); +// valid buffer extent is described by another parameter +#define _Deref_post_count_(size) _Deref_post2_impl_(_$notnull, _$count(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_count_(size) _Deref_post2_impl_(_$maybenull,_$count(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_bytecount_(size) _Deref_post2_impl_(_$notnull, _$bytecount(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_bytecount_(size) _Deref_post2_impl_(_$maybenull,_$bytecount(size)) _Deref2_post1_impl_(_$valid) + +// buffer capacity is described by a constant expression +#define _Deref_post_count_c_(size) _Deref_post2_impl_(_$notnull, _$count_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_count_c_(size) _Deref_post2_impl_(_$maybenull,_$count_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_bytecount_c_(size) _Deref_post2_impl_(_$notnull, _$bytecount_c(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_bytecount_c_(size) _Deref_post2_impl_(_$maybenull,_$bytecount_c(size)) _Deref2_post1_impl_(_$valid) + +// buffer capacity is described by a complex expression +#define _Deref_post_count_x_(size) _Deref_post2_impl_(_$notnull, _$count_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_count_x_(size) _Deref_post2_impl_(_$maybenull,_$count_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_bytecount_x_(size) _Deref_post2_impl_(_$notnull, _$bytecount_x(size)) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_bytecount_x_(size) 
_Deref_post2_impl_(_$maybenull,_$bytecount_x(size)) _Deref2_post1_impl_(_$valid) + +// e.g. void GetStrings( _Out_count_(cElems) _Deref_post_valid_ LPSTR const rgStr[], size_t cElems ); +#define _Deref_post_valid_ _Deref_post1_impl_(_$notnull) _Deref2_post1_impl_(_$valid) +#define _Deref_post_opt_valid_ _Deref_post1_impl_(_$maybenull) _Deref2_post1_impl_(_$valid) + +#define _Deref_post_notnull_ _Deref_post1_impl_(_$notnull) +#define _Deref_post_maybenull_ _Deref_post1_impl_(_$maybenull) +#define _Deref_post_null_ _Deref_post1_impl_(_$null) + +// +// _Deref_ret_ --- +// + +#define _Deref_ret_z_ _Deref_ret2_impl_(_$notnull, _$zterm) +#define _Deref_ret_opt_z_ _Deref_ret2_impl_(_$maybenull,_$zterm) + +// +// special _Deref_ --- +// +#define _Deref2_pre_readonly_ _Deref2_pre1_impl_(_$readaccess) + +// Convenience macros for more concise annotations + +// +// _Pre_post --- +// +// describing conditions that hold before and after the function call + +#define _Prepost_z_ _Pre_z_ _Post_z_ +#define _Prepost_opt_z_ _Pre_opt_z_ _Post_z_ + +#define _Prepost_count_(size) _Pre_count_(size) _Post_count_(size) +#define _Prepost_opt_count_(size) _Pre_opt_count_(size) _Post_count_(size) +#define _Prepost_bytecount_(size) _Pre_bytecount_(size) _Post_bytecount_(size) +#define _Prepost_opt_bytecount_(size) _Pre_opt_bytecount_(size) _Post_bytecount_(size) +#define _Prepost_count_c_(size) _Pre_count_c_(size) _Post_count_c_(size) +#define _Prepost_opt_count_c_(size) _Pre_opt_count_c_(size) _Post_count_c_(size) +#define _Prepost_bytecount_c_(size) _Pre_bytecount_c_(size) _Post_bytecount_c_(size) +#define _Prepost_opt_bytecount_c_(size) _Pre_opt_bytecount_c_(size) _Post_bytecount_c_(size) +#define _Prepost_count_x_(size) _Pre_count_x_(size) _Post_count_x_(size) +#define _Prepost_opt_count_x_(size) _Pre_opt_count_x_(size) _Post_count_x_(size) +#define _Prepost_bytecount_x_(size) _Pre_bytecount_x_(size) _Post_bytecount_x_(size) +#define _Prepost_opt_bytecount_x_(size) 
_Pre_opt_bytecount_x_(size) _Post_bytecount_x_(size) + +#define _Prepost_valid_ _Pre_valid_ _Post_valid_ +#define _Prepost_opt_valid_ _Pre_opt_valid_ _Post_valid_ + +// +// _Deref_<both> --- +// +// short version for _Deref_pre_<ann> _Deref_post_<ann> +// describing conditions for array elements or dereferenced pointer parameters that hold before and after the call + +#define _Deref_prepost_z_ _Deref_pre_z_ _Deref_post_z_ +#define _Deref_prepost_opt_z_ _Deref_pre_opt_z_ _Deref_post_opt_z_ + +#define _Deref_prepost_cap_(size) _Deref_pre_cap_(size) _Deref_post_cap_(size) +#define _Deref_prepost_opt_cap_(size) _Deref_pre_opt_cap_(size) _Deref_post_opt_cap_(size) +#define _Deref_prepost_bytecap_(size) _Deref_pre_bytecap_(size) _Deref_post_bytecap_(size) +#define _Deref_prepost_opt_bytecap_(size) _Deref_pre_opt_bytecap_(size) _Deref_post_opt_bytecap_(size) + +#define _Deref_prepost_cap_x_(size) _Deref_pre_cap_x_(size) _Deref_post_cap_x_(size) +#define _Deref_prepost_opt_cap_x_(size) _Deref_pre_opt_cap_x_(size) _Deref_post_opt_cap_x_(size) +#define _Deref_prepost_bytecap_x_(size) _Deref_pre_bytecap_x_(size) _Deref_post_bytecap_x_(size) +#define _Deref_prepost_opt_bytecap_x_(size) _Deref_pre_opt_bytecap_x_(size) _Deref_post_opt_bytecap_x_(size) + +#define _Deref_prepost_z_cap_(size) _Deref_pre_z_cap_(size) _Deref_post_z_cap_(size) +#define _Deref_prepost_opt_z_cap_(size) _Deref_pre_opt_z_cap_(size) _Deref_post_opt_z_cap_(size) +#define _Deref_prepost_z_bytecap_(size) _Deref_pre_z_bytecap_(size) _Deref_post_z_bytecap_(size) +#define _Deref_prepost_opt_z_bytecap_(size) _Deref_pre_opt_z_bytecap_(size) _Deref_post_opt_z_bytecap_(size) + +#define _Deref_prepost_valid_cap_(size) _Deref_pre_valid_cap_(size) _Deref_post_valid_cap_(size) +#define _Deref_prepost_opt_valid_cap_(size) _Deref_pre_opt_valid_cap_(size) _Deref_post_opt_valid_cap_(size) +#define _Deref_prepost_valid_bytecap_(size) _Deref_pre_valid_bytecap_(size) _Deref_post_valid_bytecap_(size) +#define 
_Deref_prepost_opt_valid_bytecap_(size) _Deref_pre_opt_valid_bytecap_(size) _Deref_post_opt_valid_bytecap_(size) + +#define _Deref_prepost_valid_cap_x_(size) _Deref_pre_valid_cap_x_(size) _Deref_post_valid_cap_x_(size) +#define _Deref_prepost_opt_valid_cap_x_(size) _Deref_pre_opt_valid_cap_x_(size) _Deref_post_opt_valid_cap_x_(size) +#define _Deref_prepost_valid_bytecap_x_(size) _Deref_pre_valid_bytecap_x_(size) _Deref_post_valid_bytecap_x_(size) +#define _Deref_prepost_opt_valid_bytecap_x_(size) _Deref_pre_opt_valid_bytecap_x_(size) _Deref_post_opt_valid_bytecap_x_(size) + +#define _Deref_prepost_count_(size) _Deref_pre_count_(size) _Deref_post_count_(size) +#define _Deref_prepost_opt_count_(size) _Deref_pre_opt_count_(size) _Deref_post_opt_count_(size) +#define _Deref_prepost_bytecount_(size) _Deref_pre_bytecount_(size) _Deref_post_bytecount_(size) +#define _Deref_prepost_opt_bytecount_(size) _Deref_pre_opt_bytecount_(size) _Deref_post_opt_bytecount_(size) + +#define _Deref_prepost_count_x_(size) _Deref_pre_count_x_(size) _Deref_post_count_x_(size) +#define _Deref_prepost_opt_count_x_(size) _Deref_pre_opt_count_x_(size) _Deref_post_opt_count_x_(size) +#define _Deref_prepost_bytecount_x_(size) _Deref_pre_bytecount_x_(size) _Deref_post_bytecount_x_(size) +#define _Deref_prepost_opt_bytecount_x_(size) _Deref_pre_opt_bytecount_x_(size) _Deref_post_opt_bytecount_x_(size) + +#define _Deref_prepost_valid_ _Deref_pre_valid_ _Deref_post_valid_ +#define _Deref_prepost_opt_valid_ _Deref_pre_opt_valid_ _Deref_post_opt_valid_ + +// +// _Deref_<miscellaneous> +// +// used with references to arrays + +#define _Deref_out_z_cap_c_(size) _Deref_pre_cap_c_(size) _Deref_pre_invalid_ _Deref_post_z_ +#define _Deref_inout_z_cap_c_(size) _Deref_pre_z_cap_c_(size) _Deref_post_z_ +#define _Deref_out_z_bytecap_c_(size) _Deref_pre_bytecap_c_(size) _Deref_pre_invalid_ _Deref_post_z_ +#define _Deref_inout_z_bytecap_c_(size) _Deref_pre_z_bytecap_c_(size) _Deref_post_z_ +#define _Deref_inout_z_ 
_Deref_prepost_z_ + +//============================================================================ +// Implementation Layer: +//============================================================================ + +#if _USE_ATTRIBUTES_FOR_SAL + +#define _Check_return_impl_ [returnvalue:SA_Post(MustCheck=SA_Yes)] + +#define _Success_impl_(expr) [SA_Success(Condition=#expr)] + +#define _Printf_format_string_impl_ [SA_FormatString(Style="printf")] +#define _Scanf_format_string_impl_ [SA_FormatString(Style="scanf")] +#define _Scanf_s_format_string_impl_ [SA_FormatString(Style="scanf_s")] + +#define _In_bound_impl_ [SA_PreBound(Deref=0)] +#define _Out_bound_impl_ [SA_PostBound(Deref=0)] +#define _Ret_bound_impl_ [returnvalue:SA_PostBound(Deref=0)] +#define _Deref_in_bound_impl_ [SA_PreBound(Deref=1)] +#define _Deref_out_bound_impl_ [SA_PostBound(Deref=1)] +#define _Deref_ret_bound_impl_ [returnvalue:SA_PostBound(Deref=1)] + +#define _In_range_impl_(min,max) [SA_PreRange(MinVal=#min,MaxVal=#max)] +#define _Out_range_impl_(min,max) [SA_PostRange(MinVal=#min,MaxVal=#max)] +#define _Ret_range_impl_(min,max) [returnvalue:SA_PostRange(MinVal=#min,MaxVal=#max)] +#define _Deref_in_range_impl_(min,max) [SA_PreRange(Deref=1,MinVal=#min,MaxVal=#max)] +#define _Deref_out_range_impl_(min,max) [SA_PostRange(Deref=1,MinVal=#min,MaxVal=#max)] +#define _Deref_ret_range_impl_(min,max) [returnvalue:SA_PostRange(Deref=1,MinVal=#min,MaxVal=#max)] + +#define _$valid Valid=SA_Yes +#define _$maybevalid Valid=SA_Maybe +#define _$notvalid Valid=SA_No + +#define _$null Null=SA_Yes +#define _$maybenull Null=SA_Maybe +#define _$notnull Null=SA_No + +#define _$zterm NullTerminated=SA_Yes +#define _$maybezterm NullTerminated=SA_Maybe +#define _$notzterm NullTerminated=SA_No + +#define _$readaccess Access=SA_Read +#define _$writeaccess Access=SA_Write + +#define _$cap(size) WritableElements=#size +#define _$cap_c(size) WritableElementsConst=size +#define _$cap_for(param) WritableElementsLength=#param 
+#define _$cap_x(size) WritableElements="\n@"#size + +#define _$bytecap(size) WritableBytes=#size +#define _$bytecap_c(size) WritableBytesConst=size +#define _$bytecap_x(size) WritableBytes="\n@"#size + +#define _$mult(mult,size) ElementSizeConst=mult,_$cap(size) + +#define _$count(size) ValidElements=#size +#define _$count_c(size) ValidElementsConst=size +#define _$count_x(size) ValidElements="\n@"#size + +#define _$bytecount(size) ValidBytes=#size +#define _$bytecount_c(size) ValidBytesConst=size +#define _$bytecount_x(size) ValidBytes="\n@"#size + +#define _Pre1_impl_(p1) [SA_Pre(p1)] +#define _Pre2_impl_(p1,p2) [SA_Pre(p1,p2)] +#define _Pre3_impl_(p1,p2,p3) [SA_Pre(p1,p2,p3)] + +#define _Post1_impl_(p1) [SA_Post(p1)] +#define _Post2_impl_(p1,p2) [SA_Post(p1,p2)] +#define _Post3_impl_(p1,p2,p3) [SA_Post(p1,p2,p3)] + +#define _Ret1_impl_(p1) [returnvalue:SA_Post(p1)] +#define _Ret2_impl_(p1,p2) [returnvalue:SA_Post(p1,p2)] +#define _Ret3_impl_(p1,p2,p3) [returnvalue:SA_Post(p1,p2,p3)] + +#define _Deref_pre1_impl_(p1) [SA_Pre(Deref=1,p1)] +#define _Deref_pre2_impl_(p1,p2) [SA_Pre(Deref=1,p1,p2)] +#define _Deref_pre3_impl_(p1,p2,p3) [SA_Pre(Deref=1,p1,p2,p3)] + +#define _Deref_post1_impl_(p1) [SA_Post(Deref=1,p1)] +#define _Deref_post2_impl_(p1,p2) [SA_Post(Deref=1,p1,p2)] +#define _Deref_post3_impl_(p1,p2,p3) [SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref_ret1_impl_(p1) [returnvalue:SA_Post(Deref=1,p1)] +#define _Deref_ret2_impl_(p1,p2) [returnvalue:SA_Post(Deref=1,p1,p2)] +#define _Deref_ret3_impl_(p1,p2,p3) [returnvalue:SA_Post(Deref=1,p1,p2,p3)] + +#define _Deref2_pre1_impl_(p1) [SA_Pre(Deref=2,p1)] +#define _Deref2_post1_impl_(p1) [SA_Post(Deref=2,p1)] +#define _Deref2_ret1_impl_(p1) [returnvalue:SA_Post(Deref=2,p1)] + +#elif _USE_DECLSPECS_FOR_SAL + +#define _$SPECSTRIZE( x ) #x + +#define _Check_return_impl_ __declspec("SAL_checkReturn") + +#define _Success_impl_(expr) __declspec("SAL_success("_$SPECSTRIZE(expr)")") + +#define _Printf_format_string_impl_ 
// Declspec-based implementation branch: format-string annotations carry no declspec
// encoding, so they expand to nothing here.
#define _Scanf_format_string_impl_
#define _Scanf_s_format_string_impl_

// Bound annotations: value is constrained by the (externally recorded) bound.
// NOTE(review): the vendored copy used _$derefpre/_$derefpost (never defined — this
// branch defines _$deref_pre/_$deref_post below, the spelling every
// _Deref_pre1_impl_/_Deref_post1_impl_ uses) and a bare `bound` token in
// _Deref_ret_bound_impl_; both are corrected to the defined helper names.
#define _In_bound_impl_                    _$pre _$bound
#define _Out_bound_impl_                   _$post _$bound
#define _Ret_bound_impl_                   _$post _$bound
#define _Deref_in_bound_impl_              _$deref_pre _$bound
#define _Deref_out_bound_impl_             _$deref_post _$bound
#define _Deref_ret_bound_impl_             _$deref_post _$bound

// Range annotations: value lies in [min,max].
#define _In_range_impl_(min,max)           _$pre _$range(min,max)
#define _Out_range_impl_(min,max)          _$post _$range(min,max)
#define _Ret_range_impl_(min,max)          _$post _$range(min,max)
#define _Deref_in_range_impl_(min,max)     _$deref_pre _$range(min,max)
#define _Deref_out_range_impl_(min,max)    _$deref_post _$range(min,max)
#define _Deref_ret_range_impl_(min,max)    _$deref_post _$range(min,max)

// Primitive state helpers encoded as __declspec strings for /analyze.
#define _$valid        __declspec("SAL_valid")
#define _$maybevalid   __declspec("SAL_maybevalid")
#define _$notvalid     __declspec("SAL_notvalid")

#define _$null         __declspec("SAL_null")
#define _$maybenull    __declspec("SAL_maybenull")
#define _$notnull      __declspec("SAL_notnull")

#define _$zterm        __declspec("SAL_readableTo(sentinel(0))")
#define _$maybezterm
#define _$notzterm

#define _$readaccess   __declspec("SAL_readonly")
#define _$writeaccess  __declspec("SAL_notreadonly")

// Writable-extent helpers (element counts vs. byte counts).
#define _$cap(size)      __declspec("SAL_writableTo(elementCount("_$SPECSTRIZE(size)"))")
#define _$cap_c(size)    __declspec("SAL_writableTo(elementCount("_$SPECSTRIZE(size)"))")
#define _$cap_for(param) __declspec("SAL_writableTo(needsCountFor("_$SPECSTRIZE(param)"))")
#define _$cap_x(size)    __declspec("SAL_writableTo(inexpressibleCount('"_$SPECSTRIZE(size)"'))")

#define _$bytecap(size)   __declspec("SAL_writableTo(byteCount("_$SPECSTRIZE(size)"))")
#define _$bytecap_c(size) __declspec("SAL_writableTo(byteCount("_$SPECSTRIZE(size)"))")
#define _$bytecap_x(size) __declspec("SAL_writableTo(inexpressibleCount('"_$SPECSTRIZE(size)"'))")

#define _$mult(mult,size) __declspec("SAL_writableTo(inexpressibleCount("_$SPECSTRIZE(mult)"*"_$SPECSTRIZE(size)"))")
_$count(size) __declspec("SAL_readableTo(elementCount("_$SPECSTRIZE(size)"))") +#define _$count_c(size) __declspec("SAL_readableTo(elementCount("_$SPECSTRIZE(size)"))") +#define _$count_x(size) __declspec("SAL_readableTo(inexpressibleCount('"_$SPECSTRIZE(size)"'))") + +#define _$bytecount(size) __declspec("SAL_readableTo(byteCount("_$SPECSTRIZE(size)"))") +#define _$bytecount_c(size) __declspec("SAL_readableTo(byteCount("_$SPECSTRIZE(size)"))") +#define _$bytecount_x(size) __declspec("SAL_readableTo(inexpressibleCount('"_$SPECSTRIZE(size)"'))") + +#define _$pre __declspec("SAL_pre") +#define _$post __declspec("SAL_post") +#define _$deref_pre __declspec("SAL_pre") __declspec("SAL_deref") +#define _$deref_post __declspec("SAL_post") __declspec("SAL_deref") + +#define _$bound __declspec("SAL_bound") +#define _$range(min,max) __declspec("SAL_range("_$SPECSTRIZE(min)","_$SPECSTRIZE(max)")") + +#define _Pre1_impl_(p1) _$pre p1 +#define _Pre2_impl_(p1,p2) _$pre p1 _$pre p2 +#define _Pre3_impl_(p1,p2,p3) _$pre p1 _$pre p2 _$pre p3 + +#define _Post1_impl_(p1) _$post p1 +#define _Post2_impl_(p1,p2) _$post p1 _$post p2 +#define _Post3_impl_(p1,p2,p3) _$post p1 _$post p2 _$post p3 + +#define _Ret1_impl_(p1) _$post p1 +#define _Ret2_impl_(p1,p2) _$post p1 _$post p2 +#define _Ret3_impl_(p1,p2,p3) _$post p1 _$post p2 _$post p3 + +#define _Deref_pre1_impl_(p1) _$deref_pre p1 +#define _Deref_pre2_impl_(p1,p2) _$deref_pre p1 _$deref_pre p2 +#define _Deref_pre3_impl_(p1,p2,p3) _$deref_pre p1 _$deref_pre p2 _$deref_pre p3 + +#define _Deref_post1_impl_(p1) _$deref_post p1 +#define _Deref_post2_impl_(p1,p2) _$deref_post p1 _$deref_post p2 +#define _Deref_post3_impl_(p1,p2,p3) _$deref_post p1 _$deref_post p2 _$deref_post p3 + +#define _Deref_ret1_impl_(p1) _$deref_post p1 +#define _Deref_ret2_impl_(p1,p2) _$deref_post p1 _$deref_post p2 +#define _Deref_ret3_impl_(p1,p2,p3) _$deref_post p1 _$deref_post p2 _$deref_post p3 + +#define _Deref2_pre1_impl_(p1) _$deref_pre 
__declspec("SAL_deref") p1 +#define _Deref2_post1_impl_(p1) _$deref_post __declspec("SAL_deref") p1 +#define _Deref2_ret1_impl_(p1) _$deref_post __declspec("SAL_deref") p1 + +#elif defined(_MSC_EXTENSIONS) && !defined( MIDL_PASS ) && !defined(__midl) && !defined(RC_INVOKED) && defined(_PFT_VER) && _MSC_VER >= 1400 + +// minimum attribute expansion for foreground build + +#pragma push_macro( "SA" ) +#pragma push_macro( "REPEATABLE" ) + +#ifdef __cplusplus +#define SA( id ) id +#define REPEATABLE [repeatable] +#else // !__cplusplus +#define SA( id ) SA_##id +#define REPEATABLE +#endif // !__cplusplus + +REPEATABLE +[source_annotation_attribute( SA( Parameter ) )] +struct _$P +{ +#ifdef __cplusplus + _$P(); +#endif + int _$d; +}; +typedef struct _$P _$P; + +REPEATABLE +[source_annotation_attribute( SA( ReturnValue ) )] +struct _$R +{ +#ifdef __cplusplus + _$R(); +#endif + int _$d; +}; +typedef struct _$R _$R; + +[source_annotation_attribute( SA( Method ) )] +struct _$M +{ +#ifdef __cplusplus + _$M(); +#endif + int _$d; +}; +typedef struct _$M _$M; + +#pragma pop_macro( "REPEATABLE" ) +#pragma pop_macro( "SA" ) + +#define _Check_return_impl_ [returnvalue:_$R(_$d=0)] + +#define _Success_impl_(expr) [_$M(_$d=0)] + +#define _Printf_format_string_impl_ [_$P(_$d=0)] +#define _Scanf_format_string_impl_ [_$P(_$d=0)] +#define _Scanf_s_format_string_impl_ [_$P(_$d=0)] + +#define _In_bound_impl_ [_$P(_$d=0)] +#define _Out_bound_impl_ [_$P(_$d=0)] +#define _Ret_bound_impl_ [returnvalue:_$R(_$d=0)] +#define _Deref_in_bound_impl_ [_$P(_$d=0)] +#define _Deref_out_bound_impl_ [_$P(_$d=0)] +#define _Deref_ret_bound_impl_ [returnvalue:_$R(_$d=0)] + +#define _In_range_impl_(min,max) [_$P(_$d=0)] +#define _Out_range_impl_(min,max) [_$P(_$d=0)] +#define _Ret_range_impl_(min,max) [returnvalue:_$R(_$d=0)] +#define _Deref_in_range_impl_(min,max) [_$P(_$d=0)] +#define _Deref_out_range_impl_(min,max) [_$P(_$d=0)] +#define _Deref_ret_range_impl_(min,max) [returnvalue:_$R(_$d=0)] + +#define 
_Pre1_impl_(p1) [_$P(_$d=0)] +#define _Pre2_impl_(p1,p2) [_$P(_$d=0)] +#define _Pre3_impl_(p1,p2,p3) [_$P(_$d=0)] + +#define _Post1_impl_(p1) [_$P(_$d=0)] +#define _Post2_impl_(p1,p2) [_$P(_$d=0)] +#define _Post3_impl_(p1,p2,p3) [_$P(_$d=0)] + +#define _Ret1_impl_(p1) [returnvalue:_$R(_$d=0)] +#define _Ret2_impl_(p1,p2) [returnvalue:_$R(_$d=0)] +#define _Ret3_impl_(p1,p2,p3) [returnvalue:_$R(_$d=0)] + +#define _Deref_pre1_impl_(p1) [_$P(_$d=0)] +#define _Deref_pre2_impl_(p1,p2) [_$P(_$d=0)] +#define _Deref_pre3_impl_(p1,p2,p3) [_$P(_$d=0)] + +#define _Deref_post1_impl_(p1) [_$P(_$d=0)] +#define _Deref_post2_impl_(p1,p2) [_$P(_$d=0)] +#define _Deref_post3_impl_(p1,p2,p3) [_$P(_$d=0)] + +#define _Deref_ret1_impl_(p1) [returnvalue:_$R(_$d=0)] +#define _Deref_ret2_impl_(p1,p2) [returnvalue:_$R(_$d=0)] +#define _Deref_ret3_impl_(p1,p2,p3) [returnvalue:_$R(_$d=0)] + +#define _Deref2_pre1_impl_(p1) //[_$P(_$d=0)] +#define _Deref2_post1_impl_(p1) //[_$P(_$d=0)] +#define _Deref2_ret1_impl_(p1) //[_$P(_$d=0)] + +#else + +#define _Check_return_impl_ + +#define _Success_impl_(expr) + +#define _Printf_format_string_impl_ +#define _Scanf_format_string_impl_ +#define _Scanf_s_format_string_impl_ + +#define _In_bound_impl_ +#define _Out_bound_impl_ +#define _Ret_bound_impl_ +#define _Deref_in_bound_impl_ +#define _Deref_out_bound_impl_ +#define _Deref_ret_bound_impl_ + +#define _In_range_impl_(min,max) +#define _Out_range_impl_(min,max) +#define _Ret_range_impl_(min,max) +#define _Deref_in_range_impl_(min,max) +#define _Deref_out_range_impl_(min,max) +#define _Deref_ret_range_impl_(min,max) + +#define _Pre1_impl_(p1) +#define _Pre2_impl_(p1,p2) +#define _Pre3_impl_(p1,p2,p3) + +#define _Post1_impl_(p1) +#define _Post2_impl_(p1,p2) +#define _Post3_impl_(p1,p2,p3) + +#define _Ret1_impl_(p1) +#define _Ret2_impl_(p1,p2) +#define _Ret3_impl_(p1,p2,p3) + +#define _Deref_pre1_impl_(p1) +#define _Deref_pre2_impl_(p1,p2) +#define _Deref_pre3_impl_(p1,p2,p3) + +#define 
_Deref_post1_impl_(p1) +#define _Deref_post2_impl_(p1,p2) +#define _Deref_post3_impl_(p1,p2,p3) + +#define _Deref_ret1_impl_(p1) +#define _Deref_ret2_impl_(p1,p2) +#define _Deref_ret3_impl_(p1,p2,p3) + +#define _Deref2_pre1_impl_(p1) +#define _Deref2_post1_impl_(p1) +#define _Deref2_ret1_impl_(p1) + +#endif + +// This section contains the deprecated annotations + +/* + ------------------------------------------------------------------------------- + Introduction + + sal.h provides a set of annotations to describe how a function uses its + parameters - the assumptions it makes about them, and the guarantees it makes + upon finishing. + + Annotations may be placed before either a function parameter's type or its return + type, and describe the function's behavior regarding the parameter or return value. + There are two classes of annotations: buffer annotations and advanced annotations. + Buffer annotations describe how functions use their pointer parameters, and + advanced annotations either describe complex/unusual buffer behavior, or provide + additional information about a parameter that is not otherwise expressible. + + ------------------------------------------------------------------------------- + Buffer Annotations + + The most important annotations in sal.h provide a consistent way to annotate + buffer parameters or return values for a function. Each of these annotations describes + a single buffer (which could be a string, a fixed-length or variable-length array, + or just a pointer) that the function interacts with: where it is, how large it is, + how much is initialized, and what the function does with it. + + The appropriate macro for a given buffer can be constructed using the table below. + Just pick the appropriate values from each category, and combine them together + with a leading underscore. Some combinations of values do not make sense as buffer + annotations. 
Only meaningful annotations can be added to your code; for a list of + these, see the buffer annotation definitions section. + + Only a single buffer annotation should be used for each parameter. + + |------------|------------|---------|--------|----------|----------|---------------| + | Level | Usage | Size | Output | NullTerm | Optional | Parameters | + |------------|------------|---------|--------|----------|----------|---------------| + | <> | <> | <> | <> | _z | <> | <> | + | _deref | _in | _ecount | _full | _nz | _opt | (size) | + | _deref_opt | _out | _bcount | _part | | | (size,length) | + | | _inout | | | | | | + | | | | | | | | + |------------|------------|---------|--------|----------|----------|---------------| + + Level: Describes the buffer pointer's level of indirection from the parameter or + return value 'p'. + + <> : p is the buffer pointer. + _deref : *p is the buffer pointer. p must not be NULL. + _deref_opt : *p may be the buffer pointer. p may be NULL, in which case the rest of + the annotation is ignored. + + Usage: Describes how the function uses the buffer. + + <> : The buffer is not accessed. If used on the return value or with _deref, the + function will provide the buffer, and it will be uninitialized at exit. + Otherwise, the caller must provide the buffer. This should only be used + for alloc and free functions. + _in : The function will only read from the buffer. The caller must provide the + buffer and initialize it. Cannot be used with _deref. + _out : The function will only write to the buffer. If used on the return value or + with _deref, the function will provide the buffer and initialize it. + Otherwise, the caller must provide the buffer, and the function will + initialize it. + _inout : The function may freely read from and write to the buffer. The caller must + provide the buffer and initialize it. If used with _deref, the buffer may + be reallocated by the function. + + Size: Describes the total size of the buffer. 
This may be less than the space actually + allocated for the buffer, in which case it describes the accessible amount. + + <> : No buffer size is given. If the type specifies the buffer size (such as + with LPSTR and LPWSTR), that amount is used. Otherwise, the buffer is one + element long. Must be used with _in, _out, or _inout. + _ecount : The buffer size is an explicit element count. + _bcount : The buffer size is an explicit byte count. + + Output: Describes how much of the buffer will be initialized by the function. For + _inout buffers, this also describes how much is initialized at entry. Omit this + category for _in buffers; they must be fully initialized by the caller. + + <> : The type specifies how much is initialized. For instance, a function initializing + an LPWSTR must NULL-terminate the string. + _full : The function initializes the entire buffer. + _part : The function initializes part of the buffer, and explicitly indicates how much. + + NullTerm: States if the present of a '\0' marks the end of valid elements in the buffer. + _z : A '\0' indicated the end of the buffer + _nz : The buffer may not be null terminated and a '\0' does not indicate the end of the + buffer. + Optional: Describes if the buffer itself is optional. + + <> : The pointer to the buffer must not be NULL. + _opt : The pointer to the buffer might be NULL. It will be checked before being dereferenced. + + Parameters: Gives explicit counts for the size and length of the buffer. + + <> : There is no explicit count. Use when neither _ecount nor _bcount is used. + (size) : Only the buffer's total size is given. Use with _ecount or _bcount but not _part. + (size,length) : The buffer's total size and initialized length are given. Use with _ecount_part + and _bcount_part. + + ------------------------------------------------------------------------------- + Buffer Annotation Examples + + LWSTDAPI_(BOOL) StrToIntExA( + LPCSTR pszString, -- No annotation required, const implies __in. 
+ DWORD dwFlags, + __out int *piRet -- A pointer whose dereference will be filled in. + ); + + void MyPaintingFunction( + __in HWND hwndControl, -- An initialized read-only parameter. + __in_opt HDC hdcOptional, -- An initialized read-only parameter that might be NULL. + __inout IPropertyStore *ppsStore -- An initialized parameter that may be freely used + -- and modified. + ); + + LWSTDAPI_(BOOL) PathCompactPathExA( + __out_ecount(cchMax) LPSTR pszOut, -- A string buffer with cch elements that will + -- be NULL terminated on exit. + LPCSTR pszSrc, -- No annotation required, const implies __in. + UINT cchMax, + DWORD dwFlags + ); + + HRESULT SHLocalAllocBytes( + size_t cb, + __deref_bcount(cb) T **ppv -- A pointer whose dereference will be set to an + -- uninitialized buffer with cb bytes. + ); + + __inout_bcount_full(cb) : A buffer with cb elements that is fully initialized at + entry and exit, and may be written to by this function. + + __out_ecount_part(count, *countOut) : A buffer with count elements that will be + partially initialized by this function. The function indicates how much it + initialized by setting *countOut. + + ------------------------------------------------------------------------------- + Advanced Annotations + + Advanced annotations describe behavior that is not expressible with the regular + buffer macros. These may be used either to annotate buffer parameters that involve + complex or conditional behavior, or to enrich existing annotations with additional + information. + + __success(expr) f : + <expr> indicates whether function f succeeded or not. If <expr> is true at exit, + all the function's guarantees (as given by other annotations) must hold. If <expr> + is false at exit, the caller should not expect any of the function's guarantees + to hold. If not used, the function must always satisfy its guarantees. Added + automatically to functions that indicate success in standard ways, such as by + returning an HRESULT. 
+ + __nullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + NULL character or pointer. May be used on typedefs, which marks valid (properly + initialized) instances of that type as being NULL-terminated. + + __nullnullterminated p : + Pointer p is a buffer that may be read or written up to and including the first + sequence of two NULL characters or pointers. May be used on typedefs, which marks + valid instances of that type as being double-NULL terminated. + + __reserved v : + Value v must be 0/NULL, reserved for future use. + + __checkReturn v : + Return value v must not be ignored by callers of this function. + + __typefix(ctype) v : + Value v should be treated as an instance of ctype, rather than its declared type. + + __override f : + Specify C#-style 'override' behaviour for overriding virtual methods. + + __callback f : + Function f can be used as a function pointer. + + __format_string p : + Pointer p is a string that contains % markers in the style of printf. + + __blocksOn(resource) f : + Function f blocks on the resource 'resource'. + + __fallthrough : + Annotates switch statement labels where fall-through is desired, to distinguish + from forgotten break statements. + + ------------------------------------------------------------------------------- + Advanced Annotation Examples + + __success(return == TRUE) LWSTDAPI_(BOOL) + PathCanonicalizeA(__out_ecount(MAX_PATH) LPSTR pszBuf, LPCSTR pszPath) : + pszBuf is only guaranteed to be NULL-terminated when TRUE is returned. + + typedef __nullterminated WCHAR* LPWSTR : Initialized LPWSTRs are NULL-terminated strings. + + __out_ecount(cch) __typefix(LPWSTR) void *psz : psz is a buffer parameter which will be + a NULL-terminated WCHAR string at exit, and which initially contains cch WCHARs. 
+ + ------------------------------------------------------------------------------- +*/ + +#define __specstrings + +#ifdef __cplusplus +#ifndef __nothrow +# define __nothrow __declspec(nothrow) +#endif +extern "C" { +#else +#ifndef __nothrow +# define __nothrow +#endif +#endif /* #ifdef __cplusplus */ + + +/* + ------------------------------------------------------------------------------- + Helper Macro Definitions + + These express behavior common to many of the high-level annotations. + DO NOT USE THESE IN YOUR CODE. + ------------------------------------------------------------------------------- +*/ + +/* +The helper annotations are only understood by the compiler version used by various +defect detection tools. When the regular compiler is running, they are defined into +nothing, and do not affect the compiled code. +*/ + +#if !defined(__midl) && defined(_PREFAST_) + + /* + In the primitive __declspec("SAL_*") annotations "SAL" stands for Standard + Annotation Language. These __declspec("SAL_*") annotations are the + primitives the compiler understands and all high-level SpecString MACROs + will decompose into these primivates. + */ + + #define SPECSTRINGIZE( x ) #x + + /* + __null p + __notnull p + __maybenull p + + Annotates a pointer p. States that pointer p is null. Commonly used + in the negated form __notnull or the possibly null form __maybenull. + */ + + #define __null __declspec("SAL_null") + #define __notnull __declspec("SAL_notnull") + #define __maybenull __declspec("SAL_maybenull") + + /* + __readonly l + __notreadonly l + __mabyereadonly l + + Annotates a location l. States that location l is not modified after + this point. If the annotation is placed on the precondition state of + a function, the restriction only applies until the postcondition state + of the function. __maybereadonly states that the annotated location + may be modified, whereas __notreadonly states that a location must be + modified. 
+ */ + + #define __readonly __declspec("SAL_readonly") + #define __notreadonly __declspec("SAL_notreadonly") + #define __maybereadonly __declspec("SAL_maybereadonly") + + /* + __valid v + __notvalid v + __maybevalid v + + Annotates any value v. States that the value satisfies all properties of + valid values of its type. For example, for a string buffer, valid means + that the buffer pointer is either NULL or points to a NULL-terminated string. + */ + + #define __valid __declspec("SAL_valid") + #define __notvalid __declspec("SAL_notvalid") + #define __maybevalid __declspec("SAL_maybevalid") + + /* + __readableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be read, extent describes + how much of the buffer is readable. For a reader of the buffer, this is + an explicit permission to read up to that amount, rather than a restriction to + read only up to it. + */ + + #define __readableTo(extent) __declspec("SAL_readableTo("SPECSTRINGIZE(extent)")") + + /* + + __elem_readableTo(size) + + Annotates a buffer pointer p as being readable to size elements. + */ + + #define __elem_readableTo(size) __declspec("SAL_readableTo(elementCount("SPECSTRINGIZE(size)"))") + + /* + __byte_readableTo(size) + + Annotates a buffer pointer p as being readable to size bytes. + */ + #define __byte_readableTo(size) __declspec("SAL_readableTo(byteCount("SPECSTRINGIZE(size)"))") + + /* + __writableTo(extent) p + + Annotates a buffer pointer p. If the buffer can be modified, extent + describes how much of the buffer is writable (usually the allocation + size). For a writer of the buffer, this is an explicit permission to + write up to that amount, rather than a restriction to write only up to it. + */ + #define __writableTo(size) __declspec("SAL_writableTo("SPECSTRINGIZE(size)")") + + /* + __elem_writableTo(size) + + Annotates a buffer pointer p as being writable to size elements. 
+ */ + #define __elem_writableTo(size) __declspec("SAL_writableTo(elementCount("SPECSTRINGIZE(size)"))") + + /* + __byte_writableTo(size) + + Annotates a buffer pointer p as being writable to size bytes. + */ + #define __byte_writableTo(size) __declspec("SAL_writableTo(byteCount("SPECSTRINGIZE(size)"))") + + /* + __deref p + + Annotates a pointer p. The next annotation applies one dereference down + in the type. If readableTo(p, size) then the next annotation applies to + all elements *(p+i) for which i satisfies the size. If p is a pointer + to a struct, the next annotation applies to all fields of the struct. + */ + #define __deref __declspec("SAL_deref") + + /* + __pre __next_annotation + + The next annotation applies in the precondition state + */ + #define __pre __declspec("SAL_pre") + + /* + __post __next_annotation + + The next annotation applies in the postcondition state + */ + #define __post __declspec("SAL_post") + + /* + __precond(<expr>) + + When <expr> is true, the next annotation applies in the precondition state + (currently not enabled) + */ + #define __precond(expr) __pre + + /* + __postcond(<expr>) + + When <expr> is true, the next annotation applies in the postcondition state + (currently not enabled) + */ + #define __postcond(expr) __post + + /* + __exceptthat + + Given a set of annotations Q containing __exceptthat maybeP, the effect of + the except clause is to erase any P or notP annotations (explicit or + implied) within Q at the same level of dereferencing that the except + clause appears, and to replace it with maybeP. + + Example 1: __valid __exceptthat __maybenull on a pointer p means that the + pointer may be null, and is otherwise valid, thus overriding + the implicit notnull annotation implied by __valid on + pointers. + + Example 2: __valid __deref __exceptthat __maybenull on an int **p means + that p is not null (implied by valid), but the elements + pointed to by p could be null, and are otherwise valid. 
+ */ + #define __exceptthat __declspec("SAL_except") + #define __execeptthat __exceptthat + + /* + _refparam + + Added to all out parameter macros to indicate that they are all reference + parameters. + */ + #define __refparam __deref __notreadonly + + /* + __inner_* + + Helper macros that directly correspond to certain high-level annotations. + + */ + + /* + Macros to classify the entrypoints and indicate their category. + + Pre-defined control point categories include: RPC, LPC, DeviceDriver, UserToKernel, ISAPI, COM. + + */ + #define __inner_control_entrypoint(category) __declspec("SAL_entrypoint(controlEntry, "SPECSTRINGIZE(category)")") + + /* + Pre-defined data entry point categories include: Registry, File, Network. + */ + #define __inner_data_entrypoint(category) __declspec("SAL_entrypoint(dataEntry, "SPECSTRINGIZE(category)")") + + #define __inner_success(expr) __declspec("SAL_success("SPECSTRINGIZE(expr)")") + #define __inner_checkReturn __declspec("SAL_checkReturn") + #define __inner_typefix(ctype) __declspec("SAL_typefix("SPECSTRINGIZE(ctype)")") + #define __inner_override __declspec("__override") + #define __inner_callback __declspec("__callback") + #define __inner_blocksOn(resource) __declspec("SAL_blocksOn("SPECSTRINGIZE(resource)")") + #define __inner_fallthrough_dec __inline __nothrow void __FallThrough() {} + #define __inner_fallthrough __FallThrough(); + +#else + #define __null + #define __notnull + #define __maybenull + #define __readonly + #define __notreadonly + #define __maybereadonly + #define __valid + #define __notvalid + #define __maybevalid + #define __readableTo(extent) + #define __elem_readableTo(size) + #define __byte_readableTo(size) + #define __writableTo(size) + #define __elem_writableTo(size) + #define __byte_writableTo(size) + #define __deref + #define __pre + #define __post + #define __precond(expr) + #define __postcond(expr) + #define __exceptthat + #define __execeptthat + #define __inner_success(expr) + #define 
__inner_checkReturn + #define __inner_typefix(ctype) + #define __inner_override + #define __inner_callback + #define __inner_blocksOn(resource) + #define __inner_fallthrough_dec + #define __inner_fallthrough + #define __refparam + #define __inner_control_entrypoint(category) + #define __inner_data_entrypoint(category) +#endif /* #if !defined(__midl) && defined(_PREFAST_) */ + +/* +------------------------------------------------------------------------------- +Buffer Annotation Definitions + +Any of these may be used to directly annotate functions, but only one should +be used for each parameter. To determine which annotation to use for a given +buffer, use the table in the buffer annotations section. +------------------------------------------------------------------------------- +*/ + +#define __ecount(size) __notnull __elem_writableTo(size) +#define __bcount(size) __notnull __byte_writableTo(size) +#define __in __pre __valid __pre __deref __readonly +#define __in_ecount(size) __in __pre __elem_readableTo(size) +#define __in_bcount(size) __in __pre __byte_readableTo(size) +#define __in_z __in __pre __nullterminated +#define __in_ecount_z(size) __in_ecount(size) __pre __nullterminated +#define __in_bcount_z(size) __in_bcount(size) __pre __nullterminated +#define __in_nz __in +#define __in_ecount_nz(size) __in_ecount(size) +#define __in_bcount_nz(size) __in_bcount(size) +#define __out __ecount(1) __post __valid __refparam +#define __out_ecount(size) __ecount(size) __post __valid __refparam +#define __out_bcount(size) __bcount(size) __post __valid __refparam +#define __out_ecount_part(size,length) __out_ecount(size) __post __elem_readableTo(length) +#define __out_bcount_part(size,length) __out_bcount(size) __post __byte_readableTo(length) +#define __out_ecount_full(size) __out_ecount_part(size,size) +#define __out_bcount_full(size) __out_bcount_part(size,size) +#define __out_z __post __valid __refparam __post __nullterminated +#define __out_z_opt __post __valid 
__refparam __post __nullterminated __exceptthat __maybenull +#define __out_ecount_z(size) __ecount(size) __post __valid __refparam __post __nullterminated +#define __out_bcount_z(size) __bcount(size) __post __valid __refparam __post __nullterminated +#define __out_ecount_part_z(size,length) __out_ecount_part(size,length) __post __nullterminated +#define __out_bcount_part_z(size,length) __out_bcount_part(size,length) __post __nullterminated +#define __out_ecount_full_z(size) __out_ecount_full(size) __post __nullterminated +#define __out_bcount_full_z(size) __out_bcount_full(size) __post __nullterminated +#define __out_nz __post __valid __refparam __post +#define __out_nz_opt __post __valid __refparam __post __exceptthat __maybenull +#define __out_ecount_nz(size) __ecount(size) __post __valid __refparam +#define __out_bcount_nz(size) __bcount(size) __post __valid __refparam +#define __inout __pre __valid __post __valid __refparam +#define __inout_ecount(size) __out_ecount(size) __pre __valid +#define __inout_bcount(size) __out_bcount(size) __pre __valid +#define __inout_ecount_part(size,length) __out_ecount_part(size,length) __pre __valid __pre __elem_readableTo(length) +#define __inout_bcount_part(size,length) __out_bcount_part(size,length) __pre __valid __pre __byte_readableTo(length) +#define __inout_ecount_full(size) __inout_ecount_part(size,size) +#define __inout_bcount_full(size) __inout_bcount_part(size,size) +#define __inout_z __inout __pre __nullterminated __post __nullterminated +#define __inout_ecount_z(size) __inout_ecount(size) __pre __nullterminated __post __nullterminated +#define __inout_bcount_z(size) __inout_bcount(size) __pre __nullterminated __post __nullterminated +#define __inout_nz __inout +#define __inout_ecount_nz(size) __inout_ecount(size) +#define __inout_bcount_nz(size) __inout_bcount(size) +#define __ecount_opt(size) __ecount(size) __exceptthat __maybenull +#define __bcount_opt(size) __bcount(size) __exceptthat __maybenull +#define 
__in_opt __in __exceptthat __maybenull +#define __in_ecount_opt(size) __in_ecount(size) __exceptthat __maybenull +#define __in_bcount_opt(size) __in_bcount(size) __exceptthat __maybenull +#define __in_z_opt __in_opt __pre __nullterminated +#define __in_ecount_z_opt(size) __in_ecount_opt(size) __pre __nullterminated +#define __in_bcount_z_opt(size) __in_bcount_opt(size) __pre __nullterminated +#define __in_nz_opt __in_opt +#define __in_ecount_nz_opt(size) __in_ecount_opt(size) +#define __in_bcount_nz_opt(size) __in_bcount_opt(size) +#define __out_opt __out __exceptthat __maybenull +#define __out_ecount_opt(size) __out_ecount(size) __exceptthat __maybenull +#define __out_bcount_opt(size) __out_bcount(size) __exceptthat __maybenull +#define __out_ecount_part_opt(size,length) __out_ecount_part(size,length) __exceptthat __maybenull +#define __out_bcount_part_opt(size,length) __out_bcount_part(size,length) __exceptthat __maybenull +#define __out_ecount_full_opt(size) __out_ecount_full(size) __exceptthat __maybenull +#define __out_bcount_full_opt(size) __out_bcount_full(size) __exceptthat __maybenull +#define __out_ecount_z_opt(size) __out_ecount_opt(size) __post __nullterminated +#define __out_bcount_z_opt(size) __out_bcount_opt(size) __post __nullterminated +#define __out_ecount_part_z_opt(size,length) __out_ecount_part_opt(size,length) __post __nullterminated +#define __out_bcount_part_z_opt(size,length) __out_bcount_part_opt(size,length) __post __nullterminated +#define __out_ecount_full_z_opt(size) __out_ecount_full_opt(size) __post __nullterminated +#define __out_bcount_full_z_opt(size) __out_bcount_full_opt(size) __post __nullterminated +#define __out_ecount_nz_opt(size) __out_ecount_opt(size) __post __nullterminated +#define __out_bcount_nz_opt(size) __out_bcount_opt(size) __post __nullterminated +#define __inout_opt __inout __exceptthat __maybenull +#define __inout_ecount_opt(size) __inout_ecount(size) __exceptthat __maybenull +#define __inout_bcount_opt(size) 
__inout_bcount(size) __exceptthat __maybenull +#define __inout_ecount_part_opt(size,length) __inout_ecount_part(size,length) __exceptthat __maybenull +#define __inout_bcount_part_opt(size,length) __inout_bcount_part(size,length) __exceptthat __maybenull +#define __inout_ecount_full_opt(size) __inout_ecount_full(size) __exceptthat __maybenull +#define __inout_bcount_full_opt(size) __inout_bcount_full(size) __exceptthat __maybenull +#define __inout_z_opt __inout_opt __pre __nullterminated __post __nullterminated +#define __inout_ecount_z_opt(size) __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated +#define __inout_ecount_z_opt(size) __inout_ecount_opt(size) __pre __nullterminated __post __nullterminated +#define __inout_bcount_z_opt(size) __inout_bcount_opt(size) +#define __inout_nz_opt __inout_opt +#define __inout_ecount_nz_opt(size) __inout_ecount_opt(size) +#define __inout_bcount_nz_opt(size) __inout_bcount_opt(size) +#define __deref_ecount(size) __ecount(1) __post __elem_readableTo(1) __post __deref __notnull __post __deref __elem_writableTo(size) +#define __deref_bcount(size) __ecount(1) __post __elem_readableTo(1) __post __deref __notnull __post __deref __byte_writableTo(size) +#define __deref_out __deref_ecount(1) __post __deref __valid __refparam +#define __deref_out_ecount(size) __deref_ecount(size) __post __deref __valid __refparam +#define __deref_out_bcount(size) __deref_bcount(size) __post __deref __valid __refparam +#define __deref_out_ecount_part(size,length) __deref_out_ecount(size) __post __deref __elem_readableTo(length) +#define __deref_out_bcount_part(size,length) __deref_out_bcount(size) __post __deref __byte_readableTo(length) +#define __deref_out_ecount_full(size) __deref_out_ecount_part(size,size) +#define __deref_out_bcount_full(size) __deref_out_bcount_part(size,size) +#define __deref_out_z __post __deref __valid __refparam __post __deref __nullterminated +#define __deref_out_ecount_z(size) __deref_out_ecount(size) 
__post __deref __nullterminated +#define __deref_out_bcount_z(size) __deref_out_ecount(size) __post __deref __nullterminated +#define __deref_out_nz __deref_out +#define __deref_out_ecount_nz(size) __deref_out_ecount(size) +#define __deref_out_bcount_nz(size) __deref_out_ecount(size) +#define __deref_inout __notnull __elem_readableTo(1) __pre __deref __valid __post __deref __valid __refparam +#define __deref_inout_z __deref_inout __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_inout_ecount(size) __deref_inout __pre __deref __elem_writableTo(size) __post __deref __elem_writableTo(size) +#define __deref_inout_bcount(size) __deref_inout __pre __deref __byte_writableTo(size) __post __deref __byte_writableTo(size) +#define __deref_inout_ecount_part(size,length) __deref_inout_ecount(size) __pre __deref __elem_readableTo(length) __post __deref __elem_readableTo(length) +#define __deref_inout_bcount_part(size,length) __deref_inout_bcount(size) __pre __deref __byte_readableTo(length) __post __deref __byte_readableTo(length) +#define __deref_inout_ecount_full(size) __deref_inout_ecount_part(size,size) +#define __deref_inout_bcount_full(size) __deref_inout_bcount_part(size,size) +#define __deref_inout_z __deref_inout __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_inout_ecount_z(size) __deref_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_inout_bcount_z(size) __deref_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_inout_nz __deref_inout +#define __deref_inout_ecount_nz(size) __deref_inout_ecount(size) +#define __deref_inout_bcount_nz(size) __deref_inout_ecount(size) +#define __deref_ecount_opt(size) __deref_ecount(size) __post __deref __exceptthat __maybenull +#define __deref_bcount_opt(size) __deref_bcount(size) __post __deref __exceptthat __maybenull +#define __deref_out_opt __deref_out __post __deref 
__exceptthat __maybenull +#define __deref_out_ecount_opt(size) __deref_out_ecount(size) __post __deref __exceptthat __maybenull +#define __deref_out_bcount_opt(size) __deref_out_bcount(size) __post __deref __exceptthat __maybenull +#define __deref_out_ecount_part_opt(size,length) __deref_out_ecount_part(size,length) __post __deref __exceptthat __maybenull +#define __deref_out_bcount_part_opt(size,length) __deref_out_bcount_part(size,length) __post __deref __exceptthat __maybenull +#define __deref_out_ecount_full_opt(size) __deref_out_ecount_full(size) __post __deref __exceptthat __maybenull +#define __deref_out_bcount_full_opt(size) __deref_out_bcount_full(size) __post __deref __exceptthat __maybenull +#define __deref_out_z_opt __post __deref __valid __refparam __execeptthat __maybenull __post __deref __nullterminated +#define __deref_out_ecount_z_opt(size) __deref_out_ecount_opt(size) __post __deref __nullterminated +#define __deref_out_bcount_z_opt(size) __deref_out_bcount_opt(size) __post __deref __nullterminated +#define __deref_out_nz_opt __deref_out_opt +#define __deref_out_ecount_nz_opt(size) __deref_out_ecount_opt(size) +#define __deref_out_bcount_nz_opt(size) __deref_out_bcount_opt(size) +#define __deref_inout_opt __deref_inout __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull +#define __deref_inout_ecount_opt(size) __deref_inout_ecount(size) __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull +#define __deref_inout_bcount_opt(size) __deref_inout_bcount(size) __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull +#define __deref_inout_ecount_part_opt(size,length) __deref_inout_ecount_part(size,length) __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull +#define __deref_inout_bcount_part_opt(size,length) __deref_inout_bcount_part(size,length) __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull +#define 
__deref_inout_ecount_full_opt(size) __deref_inout_ecount_full(size) __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull +#define __deref_inout_bcount_full_opt(size) __deref_inout_bcount_full(size) __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull +#define __deref_inout_z_opt __deref_inout_opt __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_inout_ecount_z_opt(size) __deref_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_inout_bcount_z_opt(size) __deref_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_inout_nz_opt __deref_inout_opt +#define __deref_inout_ecount_nz_opt(size) __deref_inout_ecount_opt(size) +#define __deref_inout_bcount_nz_opt(size) __deref_inout_bcount_opt(size) +#define __deref_opt_ecount(size) __deref_ecount(size) __exceptthat __maybenull +#define __deref_opt_bcount(size) __deref_bcount(size) __exceptthat __maybenull +#define __deref_opt_out __deref_out __exceptthat __maybenull +#define __deref_opt_out_z __deref_opt_out __post __deref __nullterminated +#define __deref_opt_out_ecount(size) __deref_out_ecount(size) __exceptthat __maybenull +#define __deref_opt_out_bcount(size) __deref_out_bcount(size) __exceptthat __maybenull +#define __deref_opt_out_ecount_part(size,length) __deref_out_ecount_part(size,length) __exceptthat __maybenull +#define __deref_opt_out_bcount_part(size,length) __deref_out_bcount_part(size,length) __exceptthat __maybenull +#define __deref_opt_out_ecount_full(size) __deref_out_ecount_full(size) __exceptthat __maybenull +#define __deref_opt_out_bcount_full(size) __deref_out_bcount_full(size) __exceptthat __maybenull +#define __deref_opt_inout __deref_inout __exceptthat __maybenull +#define __deref_opt_inout_ecount(size) __deref_inout_ecount(size) __exceptthat __maybenull +#define __deref_opt_inout_bcount(size) __deref_inout_bcount(size) 
__exceptthat __maybenull +#define __deref_opt_inout_ecount_part(size,length) __deref_inout_ecount_part(size,length) __exceptthat __maybenull +#define __deref_opt_inout_bcount_part(size,length) __deref_inout_bcount_part(size,length) __exceptthat __maybenull +#define __deref_opt_inout_ecount_full(size) __deref_inout_ecount_full(size) __exceptthat __maybenull +#define __deref_opt_inout_bcount_full(size) __deref_inout_bcount_full(size) __exceptthat __maybenull +#define __deref_opt_inout_z __deref_opt_inout __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_opt_inout_ecount_z(size) __deref_opt_inout_ecount(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_opt_inout_bcount_z(size) __deref_opt_inout_bcount(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_opt_inout_nz __deref_opt_inout +#define __deref_opt_inout_ecount_nz(size) __deref_opt_inout_ecount(size) +#define __deref_opt_inout_bcount_nz(size) __deref_opt_inout_bcount(size) +#define __deref_opt_ecount_opt(size) __deref_ecount_opt(size) __exceptthat __maybenull +#define __deref_opt_bcount_opt(size) __deref_bcount_opt(size) __exceptthat __maybenull +#define __deref_opt_out_opt __deref_out_opt __exceptthat __maybenull +#define __deref_opt_out_ecount_opt(size) __deref_out_ecount_opt(size) __exceptthat __maybenull +#define __deref_opt_out_bcount_opt(size) __deref_out_bcount_opt(size) __exceptthat __maybenull +#define __deref_opt_out_ecount_part_opt(size,length) __deref_out_ecount_part_opt(size,length) __exceptthat __maybenull +#define __deref_opt_out_bcount_part_opt(size,length) __deref_out_bcount_part_opt(size,length) __exceptthat __maybenull +#define __deref_opt_out_ecount_full_opt(size) __deref_out_ecount_full_opt(size) __exceptthat __maybenull +#define __deref_opt_out_bcount_full_opt(size) __deref_out_bcount_full_opt(size) __exceptthat __maybenull +#define __deref_opt_out_z_opt __post __deref __valid __refparam 
__exceptthat __maybenull __pre __deref __exceptthat __maybenull __post __deref __exceptthat __maybenull __post __deref __nullterminated +#define __deref_opt_out_ecount_z_opt(size) __deref_opt_out_ecount_opt(size) __post __deref __nullterminated +#define __deref_opt_out_bcount_z_opt(size) __deref_opt_out_bcount_opt(size) __post __deref __nullterminated +#define __deref_opt_out_nz_opt __deref_opt_out_opt +#define __deref_opt_out_ecount_nz_opt(size) __deref_opt_out_ecount_opt(size) +#define __deref_opt_out_bcount_nz_opt(size) __deref_opt_out_bcount_opt(size) +#define __deref_opt_inout_opt __deref_inout_opt __exceptthat __maybenull +#define __deref_opt_inout_ecount_opt(size) __deref_inout_ecount_opt(size) __exceptthat __maybenull +#define __deref_opt_inout_bcount_opt(size) __deref_inout_bcount_opt(size) __exceptthat __maybenull +#define __deref_opt_inout_ecount_part_opt(size,length) __deref_inout_ecount_part_opt(size,length) __exceptthat __maybenull +#define __deref_opt_inout_bcount_part_opt(size,length) __deref_inout_bcount_part_opt(size,length) __exceptthat __maybenull +#define __deref_opt_inout_ecount_full_opt(size) __deref_inout_ecount_full_opt(size) __exceptthat __maybenull +#define __deref_opt_inout_bcount_full_opt(size) __deref_inout_bcount_full_opt(size) __exceptthat __maybenull +#define __deref_opt_inout_z_opt __deref_opt_inout_opt __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_opt_inout_ecount_z_opt(size) __deref_opt_inout_ecount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_opt_inout_bcount_z_opt(size) __deref_opt_inout_bcount_opt(size) __pre __deref __nullterminated __post __deref __nullterminated +#define __deref_opt_inout_nz_opt __deref_opt_inout_opt +#define __deref_opt_inout_ecount_nz_opt(size) __deref_opt_inout_ecount_opt(size) +#define __deref_opt_inout_bcount_nz_opt(size) __deref_opt_inout_bcount_opt(size) + +/* 
+------------------------------------------------------------------------------- +Advanced Annotation Definitions + +Any of these may be used to directly annotate functions, and may be used in +combination with each other or with regular buffer macros. For an explanation +of each annotation, see the advanced annotations section. +------------------------------------------------------------------------------- +*/ + +#define __success(expr) __inner_success(expr) +#define __nullterminated __readableTo(sentinel(0)) +#define __nullnullterminated +#define __reserved __pre __null +#define __checkReturn __inner_checkReturn +#define __typefix(ctype) __inner_typefix(ctype) +#define __override __inner_override +#define __callback __inner_callback +#define __format_string +#define __blocksOn(resource) __inner_blocksOn(resource) +#define __control_entrypoint(category) __inner_control_entrypoint(category) +#define __data_entrypoint(category) __inner_data_entrypoint(category) + +#ifndef __fallthrough + __inner_fallthrough_dec + #define __fallthrough __inner_fallthrough +#endif + +#ifndef __analysis_assume +#ifdef _PREFAST_ +#define __analysis_assume(expr) __assume(expr) +#else +#define __analysis_assume(expr) +#endif +#endif + +#ifdef __cplusplus +} +#endif + + |
