Moderators: Sascha Willems, walaber
martinsm wrote:I want to thank Julio for open-sourcing his awesome Newton physics engine. Thanks to that, I've been able to port my little unfinished game to Android and to the PSP! It was a really fun porting project.
martinsm wrote:Performance on high-end Android devices is more or less the same as on iOS devices. On the PSP the situation is more dramatic: depending on the scene, NewtonUpdate takes up to 200 ms. That's too much, but I guess it's expected - the PSP has only a 333 MHz MIPS CPU.
// Value wrapper around a single 128-bit SSE register (simd_type) holding four
// 32-bit float lanes. Arithmetic, comparison and bitwise operators map
// one-to-one onto the corresponding _mm_*_ps intrinsics; comparison operators
// therefore return per-lane bit masks, not booleans.
//
// The constructors taking simd_type / dgFloat32 / dgInt32 are intentionally
// left implicit: the member functions rely on the implicit conversion when
// they return a raw intrinsic result.
class simd_128
{
	public:
	// Leaves the register uninitialized.
	DG_INLINE simd_128 () {}

	// Wraps an existing register value.
	DG_INLINE simd_128 (simd_type type): m_type(type) {}

	// Broadcasts one float to all four lanes.
	DG_INLINE simd_128 (dgFloat32 a): m_type(_mm_set_ps1(a)) {}

	DG_INLINE simd_128 (const simd_128& data): m_type(data.m_type) {}

	// Broadcasts the raw bit pattern of an integer to all four lanes.
	// NOTE(review): the pointer cast reinterprets the integer bits as a float;
	// this violates strict aliasing and may need a memcpy/cast intrinsic when
	// built with GCC optimizations - confirm on the target compilers.
	DG_INLINE simd_128 (dgInt32 a): m_type (_mm_set_ps1 (*(dgFloat32*)&a)){}

	// Loads four floats from a pointer that does not need to be 16-byte aligned.
	DG_INLINE simd_128 (const dgFloat32* const ptr): m_type(_mm_loadu_ps (ptr)) {}

	// Builds a register from four floats; x goes to the lowest lane, w to the highest.
	DG_INLINE simd_128 (dgFloat32 x, dgFloat32 y, dgFloat32 z, dgFloat32 w): m_type(_mm_set_ps(w, z, y, x)) {}

	// Builds a register from the raw bit patterns of four integers (ix in the lowest lane).
	// NOTE(review): same pointer-based reinterpretation as the single-integer constructor.
	DG_INLINE simd_128 (dgInt32 ix, dgInt32 iy, dgInt32 iz, dgInt32 iw): m_type(_mm_set_ps(*(dgFloat32*)&iw, *(dgFloat32*)&iz, *(dgFloat32*)&iy, *(dgFloat32*)&ix)) {}

	// Returns the lowest lane converted to a 32-bit integer by _mm_cvtss_si32
	// (a numeric float-to-int conversion, not a copy of the bit pattern).
	DG_INLINE dgInt32 GetInt () const
	{
		return _mm_cvtss_si32(m_type);
	}

	// Writes the lowest lane to *scalar.
	DG_INLINE void StoreScalar(float* const scalar) const
	{
		_mm_store_ss (scalar, m_type);
	}

	// Writes all four lanes to array[0..3]; the destination may be unaligned.
	DG_INLINE void StoreVector(float* const array) const
	{
		_mm_storeu_ps (array, m_type);
	}

	// Copy assignment. Returns a reference to this object (the previous version
	// returned a temporary copy, which is what an assignment chain would then
	// have operated on).
	DG_INLINE simd_128& operator= (const simd_128& data)
	{
		m_type = data.m_type;
		return (*this);
	}

	// Lane-wise arithmetic.
	DG_INLINE simd_128 operator+ (const simd_128& data) const
	{
		return _mm_add_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 operator- (const simd_128& data) const
	{
		return _mm_sub_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 operator* (const simd_128& data) const
	{
		return _mm_mul_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 operator/ (const simd_128& data) const
	{
		return _mm_div_ps (m_type, data.m_type);
	}

	// Lane-wise comparisons; each lane of the result is the all-ones or
	// all-zeros mask produced by the matching _mm_cmp*_ps intrinsic.
	DG_INLINE simd_128 operator<= (const simd_128& data) const
	{
		return _mm_cmple_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 operator>= (const simd_128& data) const
	{
		return _mm_cmpge_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 operator< (const simd_128& data) const
	{
		return _mm_cmplt_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 operator> (const simd_128& data) const
	{
		return _mm_cmpgt_ps (m_type, data.m_type);
	}

	// Bitwise operations on the raw lane bits.
	DG_INLINE simd_128 operator& (const simd_128& data) const
	{
		return _mm_and_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 operator| (const simd_128& data) const
	{
		return _mm_or_ps (m_type, data.m_type);
	}

	// Returns (*this) & ~data: note the operand order passed to _mm_andnot_ps,
	// which complements its first argument.
	DG_INLINE simd_128 AndNot (const simd_128& data) const
	{
		return _mm_andnot_ps (data.m_type, m_type);
	}

	// Adds the lanes together in two shuffle-and-add steps.
	// NOTE(review): presumably every lane of the result holds the total
	// x + y + z + w; this depends on how PURMUT_MASK orders its arguments,
	// which is defined outside this class - confirm.
	DG_INLINE simd_128 AddHorizontal () const
	{
		simd_128 tmp (_mm_add_ps (m_type, _mm_shuffle_ps(m_type, m_type, PURMUT_MASK(2, 3, 0, 1))));
		return _mm_add_ps (tmp.m_type, _mm_shuffle_ps(tmp.m_type, tmp.m_type, PURMUT_MASK(1, 0, 3, 2)));
	}

	// Lane-wise product followed by AddHorizontal(), i.e. a four-component dot product.
	DG_INLINE simd_128 DotProduct (const simd_128& data) const
	{
		simd_128 dot ((*this) * data);
		return dot.AddHorizontal();
	}

	// Difference of two products of shuffled operands.
	// NOTE(review): looks like the usual a.yzx * b.zxy - a.zxy * b.yzx cross
	// product on the first three lanes; exact lane mapping depends on
	// PURMUT_MASK - confirm.
	DG_INLINE simd_128 CrossProduct (const simd_128& data) const
	{
		return _mm_sub_ps (_mm_mul_ps (_mm_shuffle_ps (m_type, m_type, PURMUT_MASK(3, 0, 2, 1)), _mm_shuffle_ps (data.m_type, data.m_type, PURMUT_MASK(3, 1, 0, 2))),
						   _mm_mul_ps (_mm_shuffle_ps (m_type, m_type, PURMUT_MASK(3, 1, 0, 2)), _mm_shuffle_ps (data.m_type, data.m_type, PURMUT_MASK(3, 0, 2, 1))));
	}

	// Lane-wise absolute value: clears bit 31 (the sign bit) of every lane.
	// -0.0f has only the sign bit set, so "~mask & value" gives the same
	// result as the previous integer shift-left/shift-right by one, without
	// casting away const or reinterpreting the register through a pointer.
	DG_INLINE simd_128 Abs () const
	{
		return _mm_andnot_ps (_mm_set_ps1 (dgFloat32 (-0.0f)), m_type);
	}

	// Lane-wise floor.
	// Adding and then subtracting 1.5 * 2^23 makes the float addition round
	// each lane to a whole number; lanes where that rounded value ended up
	// greater than the input are then lowered by 1.
	// NOTE(review): presumably valid only for the default rounding mode and
	// for magnitudes well below 2^23 - confirm the expected input range.
	DG_INLINE simd_128 Floor () const
	{
		const dgFloat32 magicConst = (dgFloat32 (1.5f) * dgFloat32 (1<<23));
		simd_128 mask (magicConst, magicConst, magicConst, magicConst);
		simd_128 ret (_mm_sub_ps(_mm_add_ps(m_type, mask.m_type), mask.m_type));
		simd_128 adjust (_mm_cmplt_ps (m_type, ret.m_type));
		ret = _mm_sub_ps (ret.m_type, _mm_and_ps(_mm_set_ps1(1.0), adjust.m_type));

		// Sanity check against the scalar dgFloor. The lanes are read back
		// through StoreVector instead of the MSVC-only m128_f32 union member so
		// the check also compiles with GCC and other compilers.
		dgFloat32 source[4];
		dgFloat32 result[4];
		StoreVector (source);
		ret.StoreVector (result);
		_ASSERTE (result[0] == dgFloor(source[0]));
		_ASSERTE (result[1] == dgFloor(source[1]));
		_ASSERTE (result[2] == dgFloor(source[2]));
		_ASSERTE (result[3] == dgFloor(source[3]));
		return ret;
	}

	// Returns the 4-bit mask produced by _mm_movemask_ps (one sign bit per lane).
	DG_INLINE dgInt32 GetSignMask() const
	{
		return _mm_movemask_ps(m_type);
	}

	// Lane-wise reciprocal square root: takes the _mm_rsqrt_ps estimate r and
	// refines it with one Newton-Raphson step, 0.5 * r * (3 - x * r * r).
	DG_INLINE simd_128 InvRqrt () const
	{
		simd_128 half (dgFloat32 (0.5f));
		simd_128 three (dgFloat32 (3.0f));
		simd_128 tmp0 (_mm_rsqrt_ps(m_type));
		return half * tmp0 * (three - (*this) * tmp0 * tmp0);
	}

	// Lane-wise minimum / maximum.
	DG_INLINE simd_128 GetMin (const simd_128& data) const
	{
		return _mm_min_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 GetMax (const simd_128& data) const
	{
		return _mm_max_ps (m_type, data.m_type);
	}

	// Reduces the register with two GetMax steps: first against its own upper
	// half moved down, then against a shuffled copy of that partial result.
	// NOTE(review): presumably the lowest lane of the result is the maximum of
	// all four lanes (other lanes are not meaningful); depends on PURMUT_MASK - confirm.
	DG_INLINE simd_128 MaximunValue() const
	{
		simd_128 tmp (GetMax (_mm_movehl_ps (m_type, m_type)));
		return tmp.GetMax (_mm_shuffle_ps(tmp.m_type, tmp.m_type, PURMUT_MASK(0, 0, 0, 1)));
	}

	// Thin wrappers over _mm_movehl_ps / _mm_movelh_ps with (*this) as the first operand.
	DG_INLINE simd_128 MoveHighToLow (const simd_128& data) const
	{
		return _mm_movehl_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 MoveLowToHigh (const simd_128& data) const
	{
		return _mm_movelh_ps (m_type, data.m_type);
	}

	// Thin wrappers over _mm_unpacklo_ps / _mm_unpackhi_ps (interleave the low
	// or high lane pairs of (*this) and data).
	DG_INLINE simd_128 PackLow (const simd_128& data) const
	{
		return _mm_unpacklo_ps (m_type, data.m_type);
	}

	DG_INLINE simd_128 PackHigh (const simd_128& data) const
	{
		return _mm_unpackhi_ps (m_type, data.m_type);
	}

	// The wrapped register.
	simd_type m_type;
};
martinsm wrote:Oh, and by the way, during porting I found a small bug in Newton. When building a TreeCollision, NewtonTreeCollisionEndBuild was ignoring the optimize parameter. I needed to pass false there (to disable optimization), because Newton was crashing when optimize=true.
Needed change was here - dgAABBPolygonSoup.cpp file, line 739.
http://code.google.com/p/newton-dynamic ... up.cpp#739
I commented out "optimizedBuild = true;" and my game was not crashing.
Strangely, this was happening only on Android, not PC or iOS. I guess it's because of differences in architecture/floating-point operations.
Julio Jerez wrote:I was under the impression that Android can only be programmed in Java.
Julio Jerez wrote:Let us take it one line at a time.
Julio Jerez wrote:I was under the impression that Android can only be programmed in Java; how did you do it so fast?
This is what the class looks like so far:
Yes. Newer Androids have NEON - same thing for iPhone/iPad. You can code for them using inline assembly, or intrinsics similar to SSE. Both Android and iPhone/iPad use the same architecture - ARM - so basically they are running exactly the same architecture. The Android NDK includes some sample projects showing how to use NEON in native C/C++ code. Do the Androids support Simd instructions?
martinsm wrote:I suggest dropping the reference from the `const simd_128& data` function arguments. GCC is very picky about that: if it sees a reference or pointer to a SIMD type, it will omit some pretty significant optimizations. Pass this class by value and GCC will automatically inline everything without redundant copies.
martinsm wrote:Yes. Newer Androids have NEON - same thing for iPhone/iPad. You can code for them using inline assembly, or intrinsics similar to SSE. Do the Androids support Simd instructions?
Julio Jerez wrote:Oh, very interesting. Did you mean replacing prototypes like this:
DG_INLINE simd_128 DotProduct (const simd_128& data) const;
to this:
DG_INLINE simd_128 DotProduct (const simd_128 data) const;
using const simd& data
mov eax, DWORD PTR _data$[ebp]
movaps xmm0, XMMWORD PTR [eax]
movaps xmm1, XMMWORD PTR [ecx]
mov eax, DWORD PTR ___$ReturnUdt$[ebp]
subps xmm1, xmm0
movaps XMMWORD PTR [eax], xmm1
using const simd data
movaps xmm0, XMMWORD PTR [ecx]
movaps xmm1, XMMWORD PTR _data$[ebp]
mov eax, DWORD PTR ___$ReturnUdt$[ebp]
subps xmm0, xmm1
movaps XMMWORD PTR [eax], xmm0
__declspec(align(16)) class simd_128 { ... };
Users browsing this forum: No registered users and 37 guests