| author | daoge_cmd <3523206925@qq.com> | 2026-03-01 12:16:08 +0800 |
|---|---|---|
| committer | daoge_cmd <3523206925@qq.com> | 2026-03-01 12:16:08 +0800 |
| commit | b691c43c44ff180d10e7d4a9afc83b98551ff586 (patch) | |
| tree | 3e9849222cbc6ba49f2f1fc6e5fe7179632c7390 /Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl | |
| parent | def8cb415354ac390b7e89052a50605285f1aca9 (diff) | |
Initial commit
Diffstat (limited to 'Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl')
| -rw-r--r-- | Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl | 1962 |
1 file changed, 1962 insertions, 0 deletions
diff --git a/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl
new file mode 100644
index 00000000..c8e39352
--- /dev/null
+++ b/Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathConvert.inl
@@ -0,0 +1,1962 @@
+//-------------------------------------------------------------------------------------
+// DirectXMathConvert.inl -- SIMD C++ Math library
+//
+// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
+// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
+// PARTICULAR PURPOSE.
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//-------------------------------------------------------------------------------------
+
+#ifdef _MSC_VER
+#pragma once
+#endif
+
+/****************************************************************************
+ *
+ * Data conversion
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+
+#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
+// For VMX128, these routines are all defined in the main header
+
+#pragma warning(push)
+#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized
+
+inline XMVECTOR XMConvertVectorIntToFloat
+(
+    FXMVECTOR VInt,
+    uint32_t DivExponent
+)
+{
+    assert(DivExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex];
+        Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vcvtq_f32_s32( VInt );
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    __n128 vScale = vdupq_n_u32( uScale );
+    return vmulq_f32( vResult, vScale );
+#else // _XM_SSE_INTRINSICS_
+    // Convert to floats
+    XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat the scalar value
+    __m128i vScale = _mm_set1_epi32(uScale);
+    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XMConvertVectorFloatToInt
+(
+    FXMVECTOR VFloat,
+    uint32_t MulExponent
+)
+{
+    assert(MulExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    float fScale = (float)(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        int32_t iResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+        if (fTemp <= -(65536.0f*32768.0f)) {
+            iResult = (-0x7FFFFFFF)-1;
+        } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
+            iResult = 0x7FFFFFFF;
+        } else {
+            iResult = (int32_t)fTemp;
+        }
+        Result.vector4_u32[ElementIndex] = (uint32_t)iResult;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vdupq_n_f32((float)(1U << MulExponent));
+    vResult = vmulq_f32(vResult,VFloat);
+    // In case of positive overflow, detect it
+    __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxInt);
+    // Float to int conversion
+    __n128 vResulti = vcvtq_s32_f32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = vandq_u32(vOverflow,g_XMAbsMask);
+    vOverflow = vbicq_u32(vResulti,vOverflow);
+    vOverflow = vorrq_u32(vOverflow,vResult);
+    return vOverflow;
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult,VFloat);
+    // In case of positive overflow, detect it
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
+    // Float to int conversion
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // If there was positive overflow, set to 0x7FFFFFFF
+    vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
+    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
+    vOverflow = _mm_or_ps(vOverflow,vResult);
+    return vOverflow;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XMConvertVectorUIntToFloat
+(
+    FXMVECTOR VUInt,
+    uint32_t DivExponent
+)
+{
+    assert(DivExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    float fScale = 1.0f / (float)(1U << DivExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vcvtq_f32_u32( VUInt );
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    __n128 vScale = vdupq_n_u32( uScale );
+    return vmulq_f32( vResult, vScale );
+#else // _XM_SSE_INTRINSICS_
+    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
+    // Determine which ones need the fix.
+    XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
+    // Force all values positive
+    XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
+    // Convert to floats
+    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
+    // Convert 0x80000000 -> 0xFFFFFFFF
+    __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
+    // For only the ones that are too big, add the fixup
+    vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
+    vResult = _mm_add_ps(vResult,vMask);
+    // Convert DivExponent into 1.0f/(1<<DivExponent)
+    uint32_t uScale = 0x3F800000U - (DivExponent << 23);
+    // Splat
+    iMask = _mm_set1_epi32(uScale);
+    vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask));
+    return vResult;
+#endif
+}
+
+//------------------------------------------------------------------------------
+
+inline XMVECTOR XMConvertVectorFloatToUInt
+(
+    FXMVECTOR VFloat,
+    uint32_t MulExponent
+)
+{
+    assert(MulExponent<32);
+#if defined(_XM_NO_INTRINSICS_)
+    // Get the scalar factor.
+    float fScale = (float)(1U << MulExponent);
+    uint32_t ElementIndex = 0;
+    XMVECTOR Result;
+    do {
+        uint32_t uResult;
+        float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
+        if (fTemp <= 0.0f) {
+            uResult = 0;
+        } else if (fTemp >= (65536.0f*65536.0f)) {
+            uResult = 0xFFFFFFFFU;
+        } else {
+            uResult = (uint32_t)fTemp;
+        }
+        Result.vector4_u32[ElementIndex] = uResult;
+    } while (++ElementIndex<4);
+    return Result;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 vResult = vdupq_n_f32((float)(1U << MulExponent));
+    vResult = vmulq_f32(vResult,VFloat);
+    // In case of overflow, detect it
+    __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxUInt);
+    // Float to int conversion
+    __n128 vResulti = vcvtq_u32_f32(vResult);
+    // If there was overflow, set to 0xFFFFFFFFU
+    vResult = vbicq_u32(vResulti,vOverflow);
+    vOverflow = vorrq_u32(vOverflow,vResult);
+    return vOverflow;
+#else // _XM_SSE_INTRINSICS_
+    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
+    vResult = _mm_mul_ps(vResult,VFloat);
+    // Clamp to >=0
+    vResult = _mm_max_ps(vResult,g_XMZero);
+    // Any numbers that are too big, set to 0xFFFFFFFFU
+    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
+    XMVECTOR vValue = g_XMUnsignedFix;
+    // Too large for a signed integer?
+    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+    vValue = _mm_and_ps(vValue,vMask);
+    // Perform fixup only on numbers too large (Keeps low bit precision)
+    vResult = _mm_sub_ps(vResult,vValue);
+    __m128i vResulti = _mm_cvttps_epi32(vResult);
+    // Convert from signed to unsigned only if greater than 0x80000000
+    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
+    // On those that are too large, set to 0xFFFFFFFF
+    vResult = _mm_or_ps(vResult,vOverflow);
+    return vResult;
+#endif
+}
+
+#pragma warning(pop)
+
+#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_
+
+/****************************************************************************
+ *
+ * Vector and matrix load operations
+ *
+ ****************************************************************************/
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XMLoadInt(const uint32_t* pSource)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_u32[0] = *pSource;
+    V.vector4_u32[1] = 0;
+    V.vector4_u32[2] = 0;
+    V.vector4_u32[3] = 0;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 zero = vdupq_n_u32(0);
+    return vld1q_lane_u32( pSource, zero, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss( reinterpret_cast<const float*>(pSource) );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline XMVECTOR XMLoadFloat(const float* pSource)
+{
+    assert(pSource);
+#if defined(_XM_NO_INTRINSICS_)
+    XMVECTOR V;
+    V.vector4_f32[0] = *pSource;
+    V.vector4_f32[1] = 0.f;
+    V.vector4_f32[2] = 0.f;
+    V.vector4_f32[3] = 0.f;
+    return V;
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+    __n128 zero = vdupq_n_u32(0);
+    return vld1q_lane_f32( pSource, zero, 0 );
+#elif defined(_XM_SSE_INTRINSICS_)
+    return _mm_load_ss( pSource );
+#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
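An aside on the conversion routines above: rather than dividing, the SSE and NEON paths build the scale factor 1.0f/(1 << DivExponent) directly in the float's exponent field. 0x3F800000U is the bit pattern of 1.0f, and each unit subtracted at bit 23 halves the value, so 0x3F800000U - (DivExponent << 23) is exactly the reciprocal power of two. A minimal scalar sketch of that trick; the name `fixed_to_float` and the 16.16 example are illustrative only, not part of DirectXMath:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar illustration of the exponent-bias trick used by XMConvertVectorIntToFloat:
// 0x3F800000 is the IEEE-754 encoding of 1.0f; subtracting DivExponent from the
// exponent field (bit 23) yields 1.0f / (1 << DivExponent) without a divide.
static float fixed_to_float(int32_t value, uint32_t divExponent)
{
    assert(divExponent < 32);
    uint32_t scaleBits = 0x3F800000U - (divExponent << 23);
    float scale;
    std::memcpy(&scale, &scaleBits, sizeof(scale)); // reinterpret the bits as a float
    return static_cast<float>(value) * scale;
}

int main()
{
    // 16.16 fixed point: 0x00018000 represents 1.5
    std::printf("%f\n", fixed_to_float(0x00018000, 16)); // prints 1.500000
    return 0;
}
```

XMConvertVectorFloatToInt goes the other way with an ordinary multiply, and saturates out-of-range results to INT32_MIN / 0x7FFFFFFF instead of letting them wrap.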
+_Use_decl_annotations_ +inline XMVECTOR XMLoadInt2 +( + const uint32_t* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( pSource ); + __n64 zero = vdup_n_u32(0); + return vcombine_u32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); + return _mm_unpacklo_ps( x, y ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt2A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = 0; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32_ex( pSource, 64 ); + __n64 zero = vdup_n_u32(0); + return vcombine_u32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat2 +( + const XMFLOAT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) ); + __n64 zero = vdup_n_u32(0); + return vcombine_f32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + return _mm_unpacklo_ps( x, y ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat2A +( + const XMFLOAT2A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 ); + __n64 zero = vdup_n_u32(0); + return vcombine_f32( x, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadSInt2 +( + const XMINT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_s32( reinterpret_cast<const 
int32_t*>(pSource) ); + __n64 v = vcvt_f32_s32( x ); + __n64 zero = vdup_n_u32(0); + return vcombine_s32( v, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 V = _mm_unpacklo_ps( x, y ); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadUInt2 +( + const XMUINT2* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = 0.f; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); + __n64 v = vcvt_f32_u32( x ); + __n64 zero = vdup_n_u32(0); + return vcombine_u32( v, zero ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 V = _mm_unpacklo_ps( x, y ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt3 +( + const uint32_t* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( pSource ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_u32( pSource+2, zero, 0 ); + return vcombine_u32( x, y ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); + __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt3A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = 0; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra integer which is zero'd + __n128 V = vld1q_u32_ex( pSource, 128 ); + return vsetq_lane_u32( 0, V, 3 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra integer which 
is zero'd + __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); + V = _mm_and_si128( V, g_XMMask3 ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat3 +( + const XMFLOAT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 ); + return vcombine_f32( x, y ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( &pSource->x ); + __m128 y = _mm_load_ss( &pSource->y ); + __m128 z = _mm_load_ss( &pSource->z ); + __m128 xy = _mm_unpacklo_ps( x, y ); + return _mm_movelh_ps( xy, z ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat3A +( + const XMFLOAT3A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = 0.f; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + // Reads an extra float which is zero'd + __n128 V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); + return vsetq_lane_f32( 0, V, 3 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Reads an extra float which is zero'd + __m128 V = _mm_load_ps( &pSource->x ); + return _mm_and_ps( V, g_XMMask3 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadSInt3 +( + const XMINT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = 0.f; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 ); + __n128 v = vcombine_s32( x, y ); + return vcvtq_f32_s32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + __m128 V = _mm_movelh_ps( xy, z ); + return _mm_cvtepi32_ps(_mm_castps_si128(V)); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadUInt3 +( + const XMUINT3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = 0.f; + return 
V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); + __n64 zero = vdup_n_u32(0); + __n64 y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 ); + __n128 v = vcombine_u32( x, y ); + return vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); + __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); + __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); + __m128 xy = _mm_unpacklo_ps( x, y ); + __m128 V = _mm_movelh_ps( xy, z ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. + XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(V,vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; + +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt4 +( + const uint32_t* pSource +) +{ + assert(pSource); + +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_u32( pSource ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadInt4A +( + const uint32_t* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_u32[0] = pSource[0]; + V.vector4_u32[1] = pSource[1]; + V.vector4_u32[2] = pSource[2]; + V.vector4_u32[3] = pSource[3]; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_u32_ex( pSource, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); + return reinterpret_cast<__m128 *>(&V)[0]; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat4 +( + const XMFLOAT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32( reinterpret_cast<const float*>(pSource) ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_loadu_ps( &pSource->x ); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadFloat4A +( + const XMFLOAT4A* pSource +) +{ + 
assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = pSource->x; + V.vector4_f32[1] = pSource->y; + V.vector4_f32[2] = pSource->z; + V.vector4_f32[3] = pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + return _mm_load_ps( &pSource->x ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadSInt4 +( + const XMINT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) ); + return vcvtq_f32_s32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + return _mm_cvtepi32_ps(V); +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMVECTOR XMLoadUInt4 +( + const XMUINT4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + XMVECTOR V; + V.vector4_f32[0] = (float)pSource->x; + V.vector4_f32[1] = (float)pSource->y; + V.vector4_f32[2] = (float)pSource->z; + V.vector4_f32[3] = (float)pSource->w; + return V; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) ); + return vcvtq_f32_u32( v ); +#elif defined(_XM_SSE_INTRINSICS_) + __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); + // For the values that are higher than 0x7FFFFFFF, a fixup is needed + // Determine which ones need the fix. 
+ XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero); + // Force all values positive + XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask); + // Convert to floats + vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); + // Convert 0x80000000 -> 0xFFFFFFFF + __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); + // For only the ones that are too big, add the fixup + vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); + vResult = _mm_add_ps(vResult,vMask); + return vResult; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat3x3 +( + const XMFLOAT3X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + M.r[3].vector4_f32[0] = 0.0f; + M.r[3].vector4_f32[1] = 0.0f; + M.r[3].vector4_f32[2] = 0.0f; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v0 = vld1q_f32( &pSource->m[0][0] ); + __n128 v1 = vld1q_f32( &pSource->m[1][1] ); + __n64 v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] ); + __n128 T = vextq_f32( v0, v1, 3 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T, g_XMMask3 ); + M.r[2] = vcombine_f32( vget_high_f32(v1), v2 ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(_XM_SSE_INTRINSICS_) + __m128 Z = _mm_setzero_ps(); + + __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] ); + __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] ); + __m128 V3 = _mm_load_ss( &pSource->m[2][2] ); + + __m128 T1 = _mm_unpackhi_ps( V1, Z ); + __m128 T2 = _mm_unpacklo_ps( V2, Z ); + __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) ); + __m128 T4 = _mm_movehl_ps( T2, T3 ); + __m128 T5 = _mm_movehl_ps( Z, T1 ); + + XMMATRIX M; + M.r[0] = _mm_movelh_ps( V1, T1 ); + M.r[1] = _mm_add_ps( T4, T5 ); + M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) ); + M.r[3] = g_XMIdentityR3; + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x3 +( + const XMFLOAT4X3* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif 
defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v0 = vld1q_f32( &pSource->m[0][0] ); + __n128 v1 = vld1q_f32( &pSource->m[1][1] ); + __n128 v2 = vld1q_f32( &pSource->m[2][2] ); + + __n128 T1 = vextq_f32( v0, v1, 3 ); + __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + __n128 T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use unaligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x3A +( + const XMFLOAT4X3A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = 0.0f; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = 0.0f; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = 0.0f; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = 1.0f; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v0 = vld1q_f32_ex( &pSource->m[0][0], 128 ); + __n128 v1 = vld1q_f32_ex( &pSource->m[1][1], 128 ); + __n128 v2 = vld1q_f32_ex( &pSource->m[2][2], 128 ); + + __n128 T1 = vextq_f32( v0, v1, 3 ); + __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); + __n128 T3 = vextq_f32( v2, v2, 1 ); + + XMMATRIX M; + M.r[0] = vandq_u32( v0, g_XMMask3 ); + M.r[1] = vandq_u32( T1, g_XMMask3 ); + M.r[2] = vandq_u32( T2, g_XMMask3 ); + M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + // Use aligned load instructions to + // load the 12 floats + // vTemp1 = x1,y1,z1,x2 + XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); + // vTemp2 = y2,z2,x3,y3 + XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); + // vTemp4 = z3,x4,y4,z4 + XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); + // vTemp3 = x3,y3,z3,z3 + XMVECTOR vTemp3 = 
_mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); + // vTemp2 = y2,z2,x2,x2 + vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); + // vTemp2 = x2,y2,z2,z2 + vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); + // vTemp1 = x1,y1,z1,0 + vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); + // vTemp2 = x2,y2,z2,0 + vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); + // vTemp3 = x3,y3,z3,0 + vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); + // vTemp4i = x4,y4,z4,0 + __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); + // vTemp4i = x4,y4,z4,1.0f + vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); + XMMATRIX M(vTemp1, + vTemp2, + vTemp3, + _mm_castsi128_ps(vTemp4i)); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x4 +( + const XMFLOAT4X4* pSource +) +{ + assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) ); + M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) ); + M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) ); + M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_loadu_ps( &pSource->_11 ); + M.r[1] = _mm_loadu_ps( &pSource->_21 ); + M.r[2] = _mm_loadu_ps( &pSource->_31 ); + M.r[3] = _mm_loadu_ps( &pSource->_41 ); + return M; +#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline XMMATRIX XMLoadFloat4x4A +( + const XMFLOAT4X4A* pSource +) +{ + assert(pSource); + assert(((uintptr_t)pSource & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + + XMMATRIX M; + M.r[0].vector4_f32[0] = pSource->m[0][0]; + M.r[0].vector4_f32[1] = pSource->m[0][1]; + M.r[0].vector4_f32[2] = pSource->m[0][2]; + M.r[0].vector4_f32[3] = pSource->m[0][3]; + + M.r[1].vector4_f32[0] = pSource->m[1][0]; + M.r[1].vector4_f32[1] = pSource->m[1][1]; + M.r[1].vector4_f32[2] = pSource->m[1][2]; + M.r[1].vector4_f32[3] = pSource->m[1][3]; + + M.r[2].vector4_f32[0] = pSource->m[2][0]; + M.r[2].vector4_f32[1] = pSource->m[2][1]; + M.r[2].vector4_f32[2] = pSource->m[2][2]; + M.r[2].vector4_f32[3] = pSource->m[2][3]; + + M.r[3].vector4_f32[0] = pSource->m[3][0]; + M.r[3].vector4_f32[1] = pSource->m[3][1]; + M.r[3].vector4_f32[2] = pSource->m[3][2]; + M.r[3].vector4_f32[3] = pSource->m[3][3]; + return M; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + XMMATRIX M; + M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 ); + M.r[1] = vld1q_f32_ex( 
reinterpret_cast<const float*>(&pSource->_21), 128 ); + M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 ); + M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 ); + return M; +#elif defined(_XM_SSE_INTRINSICS_) + XMMATRIX M; + M.r[0] = _mm_load_ps( &pSource->_11 ); + M.r[1] = _mm_load_ps( &pSource->_21 ); + M.r[2] = _mm_load_ps( &pSource->_31 ); + M.r[3] = _mm_load_ps( &pSource->_41 ); + return M; +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +/**************************************************************************** + * + * Vector and matrix store operations + * + ****************************************************************************/ +_Use_decl_annotations_ +inline void XMStoreInt +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetIntX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_u32( pDestination, V, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat +( + float* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + *pDestination = XMVectorGetX( V ); +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_lane_f32( pDestination, V, 0 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ss( pDestination, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt2 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt2A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat2 +( + XMFLOAT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = 
XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat2A +( + XMFLOAT2A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreSInt2 +( + XMINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 v = vget_low_s32(V); + v = vcvt_s32_f32( v ); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write two ints + XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreUInt2 +( + XMUINT2* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 v = vget_low_u32(V); + v = vcvt_u32_f32( v ); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write two uints + XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt3 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32( pDestination, VL ); + vst1q_lane_u32( pDestination+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt3A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_u32(V); + vst1_u32_ex( pDestination, VL, 64 ); + vst1q_lane_u32( pDestination+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat3 +( + XMFLOAT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32(V); + vst1_f32( reinterpret_cast<float*>(pDestination), VL ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( &pDestination->x, V ); + _mm_store_ss( &pDestination->y, T1 ); + _mm_store_ss( &pDestination->z, T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + 
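The two- and three-element load/store pairs in this stretch deliberately touch only the bytes that belong to the user's struct: XMLoadFloat3 zero-fills the unused w lane, and XMStoreFloat3 writes back just x, y and z. A small usage sketch, assuming DirectXMath.h (or this PS3 port of it) is on the include path; `OffsetPoints` is a hypothetical helper written for illustration, not part of the library:

```cpp
#include <cstddef>
#include <DirectXMath.h>

using namespace DirectX;

// Offsets each point in an array by a fixed delta using the partial
// load/store pair from this file: XMLoadFloat3 zero-fills the w lane,
// and XMStoreFloat3 writes back only the 12 bytes of x, y and z.
void OffsetPoints(XMFLOAT3* points, std::size_t count, const XMFLOAT3& delta)
{
    const XMVECTOR vDelta = XMLoadFloat3(&delta);
    for (std::size_t i = 0; i < count; ++i)
    {
        XMVECTOR p = XMLoadFloat3(&points[i]); // w lane is 0 after the load
        p = XMVectorAdd(p, vDelta);
        XMStoreFloat3(&points[i], p);          // never writes past the struct
    }
}
```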
+//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat3A +( + XMFLOAT3A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n64 VL = vget_low_f32(V); + vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); + vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); + _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); + _mm_store_ss( &pDestination->z, T ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreSInt3 +( + XMINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_s32_f32(V); + __n64 vL = vget_low_s32(v); + vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL ); + vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreUInt3 +( + XMUINT3* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + pDestination->z = (uint32_t)V.vector4_f32[2]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_u32_f32(V); + __n64 vL = vget_low_u32(v); + vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL ); + vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); + // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise + vValue = _mm_and_ps(vValue,vMask); + // Perform fixup only on numbers too large (Keeps low bit precision) + vResult = _mm_sub_ps(vResult,vValue); + __m128i vResulti = _mm_cvttps_epi32(vResult); + // Convert from signed to unsigned pnly if greater than 0x80000000 + vMask = _mm_and_ps(vMask,g_XMNegativeZero); + vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); + // On those that are too large, set to 0xFFFFFFFF + vResult = _mm_or_ps(vResult,vOverflow); + // Write 3 uints + XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); + XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2)); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); + _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt4 +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32( pDestination, V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreInt4A +( + uint32_t* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination[0] = V.vector4_u32[0]; + pDestination[1] = V.vector4_u32[1]; + pDestination[2] = V.vector4_u32[2]; + pDestination[3] = V.vector4_u32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_u32_ex( pDestination, V, 128 ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4 +( + XMFLOAT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32( reinterpret_cast<float*>(pDestination), V ); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_storeu_ps( &pDestination->x, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreFloat4A +( + XMFLOAT4A* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); + assert(((uintptr_t)pDestination & 0xF) == 0); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = V.vector4_f32[0]; + pDestination->y = V.vector4_f32[1]; + pDestination->z = V.vector4_f32[2]; + pDestination->w = V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 
); +#elif defined(_XM_SSE_INTRINSICS_) + _mm_store_ps( &pDestination->x, V ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreSInt4 +( + XMINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (int32_t)V.vector4_f32[0]; + pDestination->y = (int32_t)V.vector4_f32[1]; + pDestination->z = (int32_t)V.vector4_f32[2]; + pDestination->w = (int32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_s32_f32(V); + vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // In case of positive overflow, detect it + XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); + // Float to int conversion + __m128i vResulti = _mm_cvttps_epi32(V); + // If there was positive overflow, set to 0x7FFFFFFF + XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); + vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); + vOverflow = _mm_or_ps(vOverflow,vResult); + _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) ); +#else // _XM_VMX128_INTRINSICS_ +#endif // _XM_VMX128_INTRINSICS_ +} + +//------------------------------------------------------------------------------ +_Use_decl_annotations_ +inline void XMStoreUInt4 +( + XMUINT4* pDestination, + FXMVECTOR V +) +{ + assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + pDestination->x = (uint32_t)V.vector4_f32[0]; + pDestination->y = (uint32_t)V.vector4_f32[1]; + pDestination->z = (uint32_t)V.vector4_f32[2]; + pDestination->w = (uint32_t)V.vector4_f32[3]; +#elif defined(_XM_ARM_NEON_INTRINSICS_) + __n128 v = vcvtq_u32_f32(V); + vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v ); +#elif defined(_XM_SSE_INTRINSICS_) + // Clamp to >=0 + XMVECTOR vResult = _mm_max_ps(V,g_XMZero); + // Any numbers that are too big, set to 0xFFFFFFFFU + XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); + XMVECTOR vValue = g_XMUnsignedFix; + // Too large for a signed integer? 
+ XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
+ // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
+ vValue = _mm_and_ps(vValue,vMask);
+ // Perform fixup only on numbers too large (Keeps low bit precision)
+ vResult = _mm_sub_ps(vResult,vValue);
+ __m128i vResulti = _mm_cvttps_epi32(vResult);
+ // Convert from signed to unsigned only if greater than 0x80000000
+ vMask = _mm_and_ps(vMask,g_XMNegativeZero);
+ vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
+ // On those that are too large, set to 0xFFFFFFFF
+ vResult = _mm_or_ps(vResult,vOverflow);
+ _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XMStoreFloat3x3
+(
+ XMFLOAT3X3* pDestination,
+ CXMMATRIX M
+)
+{
+ assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
+ __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
+ vst1q_f32( &pDestination->m[0][0], T2 );
+
+ T1 = vextq_f32( M.r[1], M.r[1], 1 );
+ T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
+ vst1q_f32( &pDestination->m[1][1], T2 );
+
+ vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 );
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp1 = M.r[0];
+ XMVECTOR vTemp2 = M.r[1];
+ XMVECTOR vTemp3 = M.r[2];
+ XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
+ vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
+ _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
+ _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
+ vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
+ _mm_store_ss(&pDestination->m[2][2],vTemp3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XMStoreFloat4x3
+(
+ XMFLOAT4X3* pDestination,
+ CXMMATRIX M
+)
+{
+ assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
+ __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
+ vst1q_f32( &pDestination->m[0][0], T2 );
+
+ T1 = vextq_f32( M.r[1], M.r[1], 1 );
+ T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
+ vst1q_f32( &pDestination->m[1][1], T2 );
+
+ T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
+ T2 = vextq_f32( T1, M.r[3], 3 );
+ vst1q_f32( &pDestination->m[2][2], T2 );
+#elif defined(_XM_SSE_INTRINSICS_)
+ XMVECTOR vTemp1 = M.r[0];
+ XMVECTOR vTemp2 = M.r[1];
+ XMVECTOR vTemp3 = M.r[2];
+ XMVECTOR vTemp4 = M.r[3];
+ XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
+ vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
+ _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
+ _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
+ _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XMStoreFloat4x3A
+(
+ XMFLOAT4X3A* pDestination,
+ CXMMATRIX M
+)
+{
+ assert(pDestination);
+ assert(((uintptr_t)pDestination & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
+ __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
+ vst1q_f32_ex( &pDestination->m[0][0], T2, 128 );
+
+ T1 = vextq_f32( M.r[1], M.r[1], 1 );
+ T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
+ vst1q_f32_ex( &pDestination->m[1][1], T2, 128 );
+
+ T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
+ T2 = vextq_f32( T1, M.r[3], 3 );
+ vst1q_f32_ex( &pDestination->m[2][2], T2, 128 );
+#elif defined(_XM_SSE_INTRINSICS_)
+ // x1,y1,z1,w1
+ XMVECTOR vTemp1 = M.r[0];
+ // x2,y2,z2,w2
+ XMVECTOR vTemp2 = M.r[1];
+ // x3,y3,z3,w3
+ XMVECTOR vTemp3 = M.r[2];
+ // x4,y4,z4,w4
+ XMVECTOR vTemp4 = M.r[3];
+ // z1,z1,x2,y2
+ XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
+ // y2,z2,x3,y3 (Final)
+ vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
+ // x1,y1,z1,x2 (Final)
+ vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
+ // z3,z3,x4,x4
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
+ // z3,x4,y4,z4 (Final)
+ vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
+ // Store in 3 operations
+ _mm_store_ps(&pDestination->m[0][0],vTemp1);
+ _mm_store_ps(&pDestination->m[1][1],vTemp2);
+ _mm_store_ps(&pDestination->m[2][2],vTemp3);
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XMStoreFloat4x4
+(
+ XMFLOAT4X4* pDestination,
+ CXMMATRIX M
+)
+{
+ assert(pDestination);
+#if defined(_XM_NO_INTRINSICS_)
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+ pDestination->m[0][3] = M.r[0].vector4_f32[3];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+ pDestination->m[1][3] = M.r[1].vector4_f32[3];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+ pDestination->m[2][3] = M.r[2].vector4_f32[3];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+ pDestination->m[3][3] = M.r[3].vector4_f32[3];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] );
+ vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] );
+ vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] );
+ vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] );
+#elif defined(_XM_SSE_INTRINSICS_)
+ _mm_storeu_ps( &pDestination->_11, M.r[0] );
+ _mm_storeu_ps( &pDestination->_21, M.r[1] );
+ _mm_storeu_ps( &pDestination->_31, M.r[2] );
+ _mm_storeu_ps( &pDestination->_41, M.r[3] );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
+//------------------------------------------------------------------------------
+_Use_decl_annotations_
+inline void XMStoreFloat4x4A
+(
+ XMFLOAT4X4A* pDestination,
+ CXMMATRIX M
+)
+{
+ assert(pDestination);
+ assert(((uintptr_t)pDestination & 0xF) == 0);
+#if defined(_XM_NO_INTRINSICS_)
+
+ pDestination->m[0][0] = M.r[0].vector4_f32[0];
+ pDestination->m[0][1] = M.r[0].vector4_f32[1];
+ pDestination->m[0][2] = M.r[0].vector4_f32[2];
+ pDestination->m[0][3] = M.r[0].vector4_f32[3];
+
+ pDestination->m[1][0] = M.r[1].vector4_f32[0];
+ pDestination->m[1][1] = M.r[1].vector4_f32[1];
+ pDestination->m[1][2] = M.r[1].vector4_f32[2];
+ pDestination->m[1][3] = M.r[1].vector4_f32[3];
+
+ pDestination->m[2][0] = M.r[2].vector4_f32[0];
+ pDestination->m[2][1] = M.r[2].vector4_f32[1];
+ pDestination->m[2][2] = M.r[2].vector4_f32[2];
+ pDestination->m[2][3] = M.r[2].vector4_f32[3];
+
+ pDestination->m[3][0] = M.r[3].vector4_f32[0];
+ pDestination->m[3][1] = M.r[3].vector4_f32[1];
+ pDestination->m[3][2] = M.r[3].vector4_f32[2];
+ pDestination->m[3][3] = M.r[3].vector4_f32[3];
+
+#elif defined(_XM_ARM_NEON_INTRINSICS_)
+ vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 );
+ vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 );
+ vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 );
+ vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 );
+#elif defined(_XM_SSE_INTRINSICS_)
+ _mm_store_ps( &pDestination->_11, M.r[0] );
+ _mm_store_ps( &pDestination->_21, M.r[1] );
+ _mm_store_ps( &pDestination->_31, M.r[2] );
+ _mm_store_ps( &pDestination->_41, M.r[3] );
+#else // _XM_VMX128_INTRINSICS_
+#endif // _XM_VMX128_INTRINSICS_
+}
+
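The aligned store variants added above (XMStoreFloat4x3A, XMStoreFloat4x4A) assert a 16-byte-aligned destination and use aligned stores, while the unaligned variants fall back to _mm_storeu_ps / plain vst1q_f32. A minimal caller-side sketch, not part of this file, assuming the accompanying DirectXMath.h header and its XMLoadFloat4x4A counterpart; the include path and the SaveTransform/RestoreTransform names are illustrative only:

#include "DirectXMath.h"   // assumed include path for this repo's PS3Extras/DirectX copy
using namespace DirectX;

// XMFLOAT4X4A is declared with 16-byte alignment, which satisfies the
// assert(((uintptr_t)pDestination & 0xF) == 0) check in XMStoreFloat4x4A
// and lets the SSE path use the aligned _mm_store_ps row stores.
void SaveTransform(CXMMATRIX M, XMFLOAT4X4A& out)   // hypothetical helper
{
    XMStoreFloat4x4A(&out, M);     // four aligned 16-byte row stores
}

XMMATRIX RestoreTransform(const XMFLOAT4X4A& in)    // hypothetical helper
{
    return XMLoadFloat4x4A(&in);   // reloads the rows into an XMMATRIX
}

When the destination is an arbitrary, possibly unaligned XMFLOAT4X4 (for example a member of a packed structure), the unaligned XMStoreFloat4x4 would be used instead.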