[sldev] llviewerjointmesh_sse2.cpp
Dzonatas
dzonatas at dzonux.net
Mon Jul 9 10:59:10 PDT 2007
I've had this file in my repository for a while. It's been tested under
conditions without cache pollution and has been shown to be 10x faster under
such conditions. It suffers from the cache pollution problem, however. Under
the many threads that run under Second Life that create the cache
pollution, the speed of this code becomes really limited. (back to the
version found in 1.18 release)
It doesn't use globals. [See VWR-1610]
Note, this was tested with the older version of the viewer, may need to
adjust something. (then again...may not.)
=)
--
Power to Change the Void
-------------- next part --------------
/**
* @file llviewerjointmesh_sse2.cpp
* @brief LLV4 class implementation with LLViewerJointMesh class
*
* Copyright (c) 2007, Linden Research, Inc.
* License: GPLv3 -- http://www.gnu.org/copyleft/gpl.html
*/
//-----------------------------------------------------------------------------
// Header Files
//-----------------------------------------------------------------------------
// Do not use precompiled headers, because we need to build this file with
// SSE support, but not the precompiled header file. JC
#include "linden_common.h"
#include "llviewerjointmesh.h"
// project includes
#include "llface.h"
#include "llpolymesh.h"
// library includes
#include "lldarray.h"
#include "llstrider.h"
#include "llv4matrix4.h"
#include "m4math.h"
#include "v3math.h"
// *NOTE: SSE2 must be enabled for this module
#if LL_VECTORIZE
#include "emmintrin.h"
#if LL_MSVC
#pragma warning( disable : 4701 ) // "potentially uninitialized local variable" -- disabled
#endif
// Component-wise linear interpolation between two 4-float vectors:
// result = a + (b - a) * w.  Operation order matches the classic
// one-multiply lerp form, so results are bit-identical to it.
inline V4F32 llv4lerp(V4F32 a, V4F32 b, V4F32 w)
{
	const V4F32 delta  = _mm_sub_ps(b, a);      // (b - a)
	const V4F32 scaled = _mm_mul_ps(delta, w);  // (b - a) * w
	return _mm_add_ps(a, scaled);               // a + (b - a) * w
}
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// LLV4Matrix3dx4
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// File-scope instrumentation counters for the SSE2 skinning path.
// Plain globals: not thread-safe; diagnostic/profiling use only.
U32 lerp_comps = 0;	// bumped by 4 for each 4-vertex group whose weights changed
U32 vert_comps = 0;	// running total of vertices processed
U32 lerp_bits = 0;	// individual joint-matrix lerps actually computed
// NOTE: LLV4Matrix3dx4 is specially designed to swizzle XYZ data streams.
LL_LLV4MATH_ALIGN_PREFIX
class LLV4Matrix3dx4
{
public:
	// 64 floats of SIMD scratch, viewable four ways.  The named lanes
	// (xx..ww) are grouped into four "rows" (x, y, z, w) of four
	// registers each; the pack/unpack methods shuffle a row between a
	// one-register-per-vector layout and a contiguous 12-float
	// (x y z, x y z, ...) stream layout.
	union
	{
		F32 mArray[4*4*4];		// flat view: 64 floats
		F32 mRow[4][4*4];		// per-row view: 4 rows x 16 floats
		__m64 mRow_i64[4][4*2];		// 64-bit halves (MMX store view)
		__m128i mRow_i128[4][4];	// integer-vector view
		struct
		{
			V4F32 xx, xy, xz, xw;	// row X
			V4F32 yx, yy, yz, yw;	// row Y
			V4F32 zx, zy, zz, zw;	// row Z
			V4F32 wx, wy, wz, ww;	// row W
		};
	};
	// Load 4 packed LLVector3s (12 contiguous floats, unaligned) and
	// swizzle their components across the X/Y/Z rows (see the lane
	// comments in the implementation).
	void load4VectorsAndTranspose(const LLVector3* a);
	// Compress one row's four registers into a contiguous 12-float
	// stream held in the row's first three registers.
	void packRowX();
	void packRowY();
	void packRowZ();
	void packRowW();
	void packRows();	// pack all four rows
	// Partial inverse of packRowW (W row only).
	void unpackRowW();
}
LL_LLV4MATH_ALIGN_POSTFIX;
// mmShuffle(d,s,d1,d2,s3,s4): readable wrapper around _mm_shuffle_ps.
// Result lanes are { d[d1], d[d2], s[s3], s[s4] } -- note the selectors
// are written low-to-high, reversed relative to _MM_SHUFFLE's
// high-to-low convention.
#define mmShuffle(d,s,d1,d2,s3,s4) _mm_shuffle_ps(d,s,_MM_SHUFFLE(s4,s3,d2,d1))

// Load four packed LLVector3s (12 floats, possibly unaligned) and
// swizzle their x/y/z components into the X, Y and Z register rows.
// The per-line comments track the lane contents after each shuffle.
inline void LLV4Matrix3dx4::load4VectorsAndTranspose(const LLVector3* a)
{
	// Three unaligned loads cover all 12 input floats.
	const V4F32 sx = _mm_loadu_ps((float*)a+0);
	const V4F32 sy = _mm_loadu_ps((float*)a+4);
	const V4F32 sz = _mm_loadu_ps((float*)a+8);
	// {sx,sy,sz} = { x1 y1 z1 x2, y2 z2 x3 y3, z3 x4 y4 z4 }
	xx = mmShuffle(sx,sx,0,0,0,3); // xx = { x1 x1 x1 x2 }
	xy = mmShuffle(xx,sy,3,3,2,2); // xy = { x2 x2 x3 x3 }
	xz = mmShuffle(xy,sz,2,2,1,1); // xz = { x3 x4 x4 x4 }
	xz = mmShuffle(xz,xz,0,2,2,3);
	yx = mmShuffle(sx,sy,1,1,0,0); // yx = { y1 y1 y1 y2 }
	yx = mmShuffle(yx,yx,0,1,1,3);
	yy = mmShuffle(yx,sy,3,3,3,3); // yy = { y2 y2 y3 y3 }
	yz = mmShuffle(yy,sz,3,3,2,2); // yz = { y3 y4 y4 y4 }
	yz = mmShuffle(yz,yz,0,2,2,3);
	zx = mmShuffle(sx,sy,2,2,1,1); // zx = { z1 z1 z1 z2 }
	zx = mmShuffle(zx,zx,0,1,1,3);
	zy = mmShuffle(zx,sz,3,3,0,0); // zy = { z2 z2 z3 z3 }
	zz = mmShuffle(sz,sz,0,3,3,3); // zz = { z3 z4 z4 z4 }
}
// Compress row X into a contiguous stream held in xx/xy/xz, folding in
// xw.  Presumably mirrors packRowW, whose packed output layout is
// visible at the interleaved stores in update_geometry_sse2 -- the
// exact lane routing lives in the shuffle masks below.
inline void LLV4Matrix3dx4::packRowX()
{
	V4F32 vt;	// shuffle scratch
	vt = mmShuffle(xy,xx,0,1,2,3);
	xx = mmShuffle(xx,vt,0,1,2,0);
	xy = mmShuffle(xy,xz,1,2,0,1);
	vt = mmShuffle(xw,xz,0,3,2,2);
	xz = mmShuffle(vt,xw,2,0,1,2);
}
// Compress row Y into a contiguous stream held in yx/yy/yz, folding in
// yw.  Identical shuffle pattern to packRowX, applied to the Y row.
inline void LLV4Matrix3dx4::packRowY()
{
	V4F32 vt;	// shuffle scratch
	vt = mmShuffle(yy,yx,0,1,2,3);
	yx = mmShuffle(yx,vt,0,1,2,0);
	yy = mmShuffle(yy,yz,1,2,0,1);
	vt = mmShuffle(yw,yz,0,3,2,2);
	yz = mmShuffle(vt,yw,2,0,1,2);
}
// Compress row Z into a contiguous stream held in zx/zy/zz, folding in
// zw.  Identical shuffle pattern to packRowX, applied to the Z row.
inline void LLV4Matrix3dx4::packRowZ()
{
	V4F32 vt;	// shuffle scratch
	vt = mmShuffle(zy,zx,0,1,2,3);
	zx = mmShuffle(zx,vt,0,1,2,0);
	zy = mmShuffle(zy,zz,1,2,0,1);
	vt = mmShuffle(zw,zz,0,3,2,2);
	zz = mmShuffle(vt,zw,2,0,1,2);
}
// Compress row W into a contiguous stream held in wx/wy/wz, folding in
// ww.  Same shuffle pattern as the other packRow* methods; the packed
// layout (e.g. wx = {v1x v1y v1z, start of v2}) can be read off the
// store comments in update_geometry_sse2.  unpackRowW reverses it.
inline void LLV4Matrix3dx4::packRowW()
{
	V4F32 vt;	// shuffle scratch
	vt = mmShuffle(wy,wx,0,1,2,3);
	wx = mmShuffle(wx,vt,0,1,2,0);
	wy = mmShuffle(wy,wz,1,2,0,1);
	vt = mmShuffle(ww,wz,0,3,2,2);
	wz = mmShuffle(vt,ww,2,0,1,2);
}
// Pack every row back to its contiguous 12-float stream layout.
// Each packRow* reads and writes only its own row's registers, so
// the rows may be processed in any order.
inline void LLV4Matrix3dx4::packRows()
{
	packRowW();
	packRowZ();
	packRowY();
	packRowX();
}
// Appears to be the (partial) inverse of packRowW: expand the W row's
// packed stream back toward one-vector-per-register form.  Note that
// wx is read but never written here -- only ww, wz and wy are
// reconstructed; the routing is encoded in the shuffle masks.
inline void LLV4Matrix3dx4::unpackRowW()
{
	ww = mmShuffle(wz,wz,1,2,3,0);
	wz = mmShuffle(wy,wz,2,3,0,0);
	wy = mmShuffle(wy,wx,0,1,3,0);
	wy = mmShuffle(wy,wy,2,0,1,0);
}
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// LLV4Matrix3dx4
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
//---
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// LLViewerJointMesh::updateGeometry()
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// Software-skins one avatar mesh with SSE2: builds blended joint
// matrices, then transforms vertex positions and normals four at a
// time, writing interleaved {vertex, normal} records into the face's
// mapped vertex buffer.  The 0-3 leftover vertices are handled by a
// scalar tail loop.
//
// @param face  destination; face->mVertexBuffer must be mapped, with
//              vertex+normal interleaved at the buffer's stride.
// @param mesh  source mesh supplying coords, normals, per-vertex skin
//              weights, and joint render data.
void update_geometry_sse2(LLFace *face, LLPolyMesh *mesh)
{
	LLV4Matrix4 sJointMat[32];

	//upload joint pivots/matrices
	{
		LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
		LLDynamicArray<LLJointRenderData*>::const_iterator i = joint_data.begin(), iend = joint_data.end();
		LLV4Matrix4* mat = &sJointMat[0];
		// FIX: guard the empty-list case -- the original dereferenced
		// begin() unconditionally, which is undefined when the array is
		// empty.  ('register' also dropped: deprecated, and removed in
		// modern C++.)
		LLJointRenderData* jd = (i != iend) ? *i : NULL;
		V4F32 vw;
		const LLMatrix4* w;
		const LLVector3* v;
		for(; jd; ++mat)
		{
			w = jd->mWorldMatrix;
			if(jd->mSkinJoint)
			{
				v = &jd->mSkinJoint->mRootToJointSkinOffset;
				jd = ++i < iend ? *i : NULL;
			}
			else
			{
				// NOTE(review): assumes an entry without skin-joint data
				// is always immediately followed by one that has it --
				// confirm against the mJointRenderData construction site.
				v = &(jd = *++i)->mSkinJoint->mRootToParentJointSkinOffset;
			}
			// Fold the skin offset into the translation row:
			//   mat.W = w.W + v.x*w.X + v.y*w.Y + v.z*w.Z
			// FIX: test alignment via size_t -- casting a pointer to
			// 'unsigned' truncates (and may not compile) on 64-bit.
			if((size_t)w & 0xF)
			{
				// unaligned source matrix
				vw = _mm_loadu_ps(w->mMatrix[VW]);
				vw = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VX]), mat->mV[VX] = _mm_loadu_ps(w->mMatrix[VX])));
				vw = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VY]), mat->mV[VY] = _mm_loadu_ps(w->mMatrix[VY])));
				mat->mV[VW] = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VZ]), mat->mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ])));
			}
			else
			{
				// 16-byte aligned source matrix: aligned loads
				vw = _mm_load_ps(w->mMatrix[VW]);
				vw = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VX]), mat->mV[VX] = _mm_load_ps(w->mMatrix[VX])));
				vw = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VY]), mat->mV[VY] = _mm_load_ps(w->mMatrix[VY])));
				mat->mV[VW] = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VZ]), mat->mV[VZ] = _mm_load_ps(w->mMatrix[VZ])));
			}
		}
	}

	//update geometry
	{
		LLV4Matrix3dx4 m_verts;	// positions of the current group of 4
		LLV4Matrix3dx4 m_norms;	// normals of the current group of 4
		LLV4Matrix3dx4 m_lerps;	// blended skinning matrices, one per vertex

		struct vertex_buffer	// *NOTE: relies on a specific vertex buffer structure
		{
			LLVector3 vertex;
			LLVector3 normal;
		};

		// FIX: the original packed these four stream pointers into a
		// single __m128i and advanced them all with one 32-bit-lane
		// paddd.  That silently corrupts the pointers on any 64-bit
		// build, so plain pointer arithmetic (identical strides) is
		// used instead.
		U32 stride = (U32)face->mVertexBuffer->getStride();
		vertex_buffer* vb = (vertex_buffer*)((U8*)face->mVertexBuffer->getMappedData() + stride * mesh->mFaceVertexOffset);
		const F32* weights = mesh->getWeights();
		const LLVector3* coords = mesh->getCoords();
		const LLVector3* normals = mesh->getNormals();

		LLV4Vector3 vweights;	// weights of the current group
		LLV4Vector3 xweights;	// weights m_lerps was last built for
		xweights.v = _mm_set1_ps(F32_MAX);	// sentinel: force first rebuild

		vert_comps += mesh->getNumVertices();

		U32 index = 0, index_end = mesh->getNumVertices();
		const U32 group_end = index_end & ~3U;	// largest multiple of 4

		for(; index < group_end; index += 4, coords += 4, normals += 4, weights += 4,
			vb = (vertex_buffer*)((U8*)vb + stride*4))
		{
			// Gather the next 4 vertices into SoA registers.
			m_verts.load4VectorsAndTranspose(coords);
			m_norms.load4VectorsAndTranspose(normals);
			vweights.v = _mm_loadu_ps(weights);
			// Non-temporal prefetch of the following group's inputs.
			_mm_prefetch((char*)(coords+4),_MM_HINT_NTA);
			_mm_prefetch((char*)(coords+5),_MM_HINT_NTA);
			_mm_prefetch((char*)(coords+6),_MM_HINT_NTA);
			_mm_prefetch((char*)(normals+4),_MM_HINT_NTA);
			_mm_prefetch((char*)(normals+5),_MM_HINT_NTA);
			_mm_prefetch((char*)(normals+6),_MM_HINT_NTA);
			_mm_prefetch((char*)(weights+4),_MM_HINT_NTA);

			// Rebuild the blended matrices only when this group's
			// weights differ from the previous group's.
			int r = _mm_movemask_ps(_mm_cmpeq_ps(vweights.v, xweights.v));
			if(r!=0xF)
			{
				lerp_comps+=4;
				if(!(r&1))
				{
					// Vertex 1: weight = joint index + blend fraction.
					lerp_bits++;
					V4F32 vi = vweights.v;
					S32 joint = _mm_cvttss_si32(vi);
					vi = _mm_cvtsi32_ss(vi, joint);
					vi = _mm_sub_ps(vweights.v, vi);	// fractional part
					vi = mmShuffle(vi,vi,0,0,0,0);		// broadcast it
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xx = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yx = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zx = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.wx = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				// Vertices 2-4: reuse an already-computed matrix when the
				// weight repeats within the group, otherwise lerp a fresh one.
				if(vweights.mV[VY] == vweights.mV[VX])
				{
					m_lerps.xy = m_lerps.xx;
					m_lerps.yy = m_lerps.yx;
					m_lerps.zy = m_lerps.zx;
					m_lerps.wy = m_lerps.wx;
				}
				else
				{
					lerp_bits++;
					V4F32 vi = vweights.v;
					vi = mmShuffle(vi,vi,1,1,1,1);
					S32 joint = _mm_cvttss_si32(vi);
					vi = _mm_cvtsi32_ss(vi, joint);
					vi = mmShuffle(vi,vi,0,0,0,0);
					vi = _mm_sub_ps(vweights.v, vi);
					vi = mmShuffle(vi,vi,1,1,1,1);
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xy = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yy = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zy = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.wy = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				if(vweights.mV[VZ] == vweights.mV[VX])
				{
					m_lerps.xz = m_lerps.xx;
					m_lerps.yz = m_lerps.yx;
					m_lerps.zz = m_lerps.zx;
					m_lerps.wz = m_lerps.wx;
				}
				else
				if(vweights.mV[VZ] == vweights.mV[VY])
				{
					m_lerps.xz = m_lerps.xy;
					m_lerps.yz = m_lerps.yy;
					m_lerps.zz = m_lerps.zy;
					m_lerps.wz = m_lerps.wy;
				}
				else
				{
					lerp_bits++;
					V4F32 vi = vweights.v;
					vi = mmShuffle(vi,vi,2,2,2,2);
					S32 joint = _mm_cvttss_si32(vi);
					vi = _mm_cvtsi32_ss(vi, joint);
					vi = mmShuffle(vi,vi,0,0,0,0);
					vi = _mm_sub_ps(vweights.v, vi);
					vi = mmShuffle(vi,vi,2,2,2,2);
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xz = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yz = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zz = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.wz = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				if(vweights.mV[VW] == vweights.mV[VX])
				{
					m_lerps.xw = m_lerps.xx;
					m_lerps.yw = m_lerps.yx;
					m_lerps.zw = m_lerps.zx;
					m_lerps.ww = m_lerps.wx;
				}
				else
				if(vweights.mV[VW] == vweights.mV[VY])
				{
					m_lerps.xw = m_lerps.xy;
					m_lerps.yw = m_lerps.yy;
					m_lerps.zw = m_lerps.zy;
					m_lerps.ww = m_lerps.wy;
				}
				else
				if(vweights.mV[VW] == vweights.mV[VZ])
				{
					m_lerps.xw = m_lerps.xz;
					m_lerps.yw = m_lerps.yz;
					m_lerps.zw = m_lerps.zz;
					m_lerps.ww = m_lerps.wz;
				}
				else
				{
					lerp_bits++;
					V4F32 vi = vweights.v;
					vi = mmShuffle(vi,vi,3,3,3,3);
					S32 joint = _mm_cvttss_si32(vi);
					vi = _mm_cvtsi32_ss(vi, joint);
					vi = mmShuffle(vi,vi,0,0,0,0);
					vi = _mm_sub_ps(vweights.v, vi);
					vi = mmShuffle(vi,vi,3,3,3,3);
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xw = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yw = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zw = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.ww = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				// NOTE(review): if (r&1) was set, row X of m_lerps was not
				// rebuilt above, yet packRows() repacks it again here --
				// confirm that case cannot arise or that packRowX is safe
				// to apply twice.
				m_lerps.packRows();
				xweights.v = vweights.v;
			}

			// position = M*v + T; normal = upper-3x3 * n (no translation)
			m_verts.wx = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xx, m_lerps.xx), _mm_mul_ps(m_verts.yx, m_lerps.yx)), _mm_mul_ps(m_verts.zx, m_lerps.zx)),m_lerps.wx);
			m_verts.wy = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xy, m_lerps.xy), _mm_mul_ps(m_verts.yy, m_lerps.yy)), _mm_mul_ps(m_verts.zy, m_lerps.zy)),m_lerps.wy);
			m_verts.wz = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xz, m_lerps.xz), _mm_mul_ps(m_verts.yz, m_lerps.yz)), _mm_mul_ps(m_verts.zz, m_lerps.zz)),m_lerps.wz);
			m_norms.wx = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xx, m_lerps.xx), _mm_mul_ps(m_norms.yx, m_lerps.yx)), _mm_mul_ps(m_norms.zx, m_lerps.zx));
			m_norms.wy = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xy, m_lerps.xy), _mm_mul_ps(m_norms.yy, m_lerps.yy)), _mm_mul_ps(m_norms.zy, m_lerps.zy));
			m_norms.wz = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xz, m_lerps.xz), _mm_mul_ps(m_norms.yz, m_lerps.yz)), _mm_mul_ps(m_norms.zz, m_lerps.zz));

			// interleave vertex/normal streams to match the output layout
			{
				V4F32 s;			// v1 v2 v3 v4
				m_verts.unpackRowW();		// n1 n2 n3 n4
				m_norms.unpackRowW();		// ::
				s = m_verts.wy;			// v1 n1 v3 n3
				m_verts.wy = m_norms.wx;	// v2 n2 v4 n4
				m_norms.wx = s;
				s = m_verts.ww;
				m_verts.ww = m_norms.wz;
				m_norms.wz = s;
				m_verts.packRowW();
				m_norms.packRowW();
			}
			// write out 4 interleaved {vertex, normal} records
			_mm_storeu_ps( ((float*)((U8*)vb+stride*0)+0), m_verts.wx);		// v1x v1y v1z n1x
			_mm_storel_pi((__m64*)((float*)((U8*)vb+stride*0)+4), m_verts.wy);	// n1y n1z
			_mm_storeu_ps( ((float*)((U8*)vb+stride*1)+0), m_norms.wx);		// v2x v2y v2z n2x
			_mm_storel_pi((__m64*)((float*)((U8*)vb+stride*1)+4), m_norms.wy);	// n2y n2z
			_mm_storeh_pi((__m64*)((float*)((U8*)vb+stride*2)+0), m_verts.wy);	// v3x v3y
			_mm_storeu_ps( ((float*)((U8*)vb+stride*2)+2), m_verts.wz);		// v3z n3x n3y n3z
			_mm_storeh_pi((__m64*)((float*)((U8*)vb+stride*3)+0), m_norms.wy);	// v4x v4y
			_mm_storeu_ps( ((float*)((U8*)vb+stride*3)+2), m_norms.wz);		// v4z n4x n4y n4z
		}

		// scalar tail for the remaining 0-3 vertices
		LLV4Matrix4 blend_mat;
		F32 weight = F32_MAX;	// sentinel: force first matrix build
		for (; index < index_end; ++index, ++coords, ++normals, ++weights, vb = (vertex_buffer*)((U8*)vb+stride))
		{
			if( weight != *weights)
			{
				// integer part selects the joint; fraction blends to the next
				S32 joint = S32(weight = *weights);
				blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);
			}
			blend_mat.multiply(*coords, vb->vertex);
			((LLV4Matrix3)blend_mat).multiply(*normals, vb->normal);
		}
	}
	//_mm_sfence();
	//_mm_empty();
}
#else
// Fallback used when this translation unit is built without SSE2
// (LL_VECTORIZE off): simply delegate to the generic vectorized path.
void update_geometry_sse2(LLFace *face, LLPolyMesh *mesh)
{
	update_geometry_vec(face, mesh);
}
#endif
More information about the SLDev
mailing list