[sldev] llviewerjointmesh_sse2.cpp

Dzonatas dzonatas at dzonux.net
Mon Jul 9 10:59:10 PDT 2007


I've had this file in my repository for a while. It has been tested under 
conditions without cache pollution and has been shown to be 10x faster under 
such conditions. It exploits the cache pollution problem, however. Under 
the many threads that run in Second Life and create the cache 
pollution, the speed of this code becomes really limited (back down to that 
of the version found in the 1.18 release).

It doesn't use globals. [See VWR-1610]

Note: this was tested with an older version of the viewer, so something may 
need adjusting. (Then again... it may not.)

=)

-- 
Power to Change the Void
-------------- next part --------------
/** 
 * @file llviewerjointmesh_sse2.cpp
 * @brief LLV4 class implementation with LLViewerJointMesh class
 *
 * Copyright (c) 2007, Linden Research, Inc.
 * License: GPLv3 -- http://www.gnu.org/copyleft/gpl.html
 */

//-----------------------------------------------------------------------------
// Header Files
//-----------------------------------------------------------------------------

// Do not use precompiled headers, because we need to build this file with
// SSE support, but not the precompiled header file. JC
#include "linden_common.h"

#include "llviewerjointmesh.h"

// project includes
#include "llface.h"
#include "llpolymesh.h"

// library includes
#include "lldarray.h"
#include "llstrider.h"
#include "llv4matrix4.h"
#include "m4math.h"
#include "v3math.h"

// *NOTE: SSE2 must be enabled for this module

#if LL_VECTORIZE

#include "emmintrin.h"


#if LL_MSVC
#pragma warning( disable     : 4701 )	// "potentially uninitialized local variable"  -- disabled
#endif

// Lane-wise linear interpolation: returns a + (b - a) * w for each of the
// four packed floats.
inline V4F32 llv4lerp(V4F32 a, V4F32 b, V4F32 w)
{
	const V4F32 delta  = _mm_sub_ps(b, a);		// b - a
	const V4F32 scaled = _mm_mul_ps(delta, w);	// (b - a) * w
	return _mm_add_ps(a, scaled);
}

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// LLV4Matrix3dx4
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------


// Running statistics counters updated by update_geometry_sse2(). They are
// plain, unsynchronized globals with external linkage.

// Incremented by 4 each time a group of four vertices needs its blended
// joint matrices rebuilt (i.e. its weights differ from the previous group).
U32 lerp_comps = 0;
// Total number of vertices submitted to update_geometry_sse2().
U32 vert_comps = 0;
// Number of individual joint-matrix interpolations actually computed.
U32 lerp_bits = 0;


// NOTE: LLV4Matrix3dx4 is specially designed to swizzle XYZ data streams.

LL_LLV4MATH_ALIGN_PREFIX

// Four rows (x, y, z, w) of four 4-float vectors each: 64 floats in total.
// The union exposes the same storage as a flat float array, as per-row float
// arrays, as 64-bit / 128-bit integer views, and as sixteen named V4F32
// members (first letter = row, second letter = vector within the row).
// The member functions below treat each row either as four separate vectors
// or as 12 tightly packed floats (four XYZ triples) held in the first three
// vectors of the row.
class LLV4Matrix3dx4
{
public:
	union
	{
		F32			mArray[4*4*4];		// flat view of all 64 floats
		F32			mRow[4][4*4];		// 16 floats per row
		__m64		mRow_i64[4][4*2];	// 8 x 64-bit chunks per row
		__m128i		mRow_i128[4][4];	// 4 x 128-bit chunks per row
		struct
		{
			V4F32	xx, xy, xz, xw;		// row X
			V4F32	yx, yy, yz, yw;		// row Y
			V4F32	zx, zy, zz, zw;		// row Z
			V4F32	wx, wy, wz, ww;		// row W
		};
	};

	// Loads 4 consecutive LLVector3 and fills rows X, Y, Z (first three
	// vectors of each) with every component replicated three times.
	void			load4VectorsAndTranspose(const LLVector3* a);
	// Pack one row: drop the 4th lane of each of its four vectors so the
	// remaining 12 floats sit contiguously in the first three vectors.
	void			packRowX();
	void			packRowY();
	void			packRowZ();
	void			packRowW();
	// Packs all four rows.
	void			packRows();
	// Expands the packed 12 floats of row W back into four vectors
	// (only the first three lanes of each are meaningful).
	void			unpackRowW();
}

LL_LLV4MATH_ALIGN_POSTFIX;


// Shuffle helper with the selectors written in result-lane order:
//   mmShuffle(d,s,d1,d2,s3,s4) -> { d[d1], d[d2], s[s3], s[s4] }
// (_MM_SHUFFLE takes its arguments highest lane first, hence the reversal.)
#define mmShuffle(d,s,d1,d2,s3,s4)	_mm_shuffle_ps(d,s,_MM_SHUFFLE(s4,s3,d2,d1))

inline void LLV4Matrix3dx4::load4VectorsAndTranspose(const LLVector3* a)
{
	const V4F32 sx = _mm_loadu_ps((float*)a+0);
	const V4F32 sy = _mm_loadu_ps((float*)a+4);
	const V4F32 sz = _mm_loadu_ps((float*)a+8);
									// {sx,sy,sz} = { x1 y1 z1 x2, y2 z2 x3 y3, z3 x4 y4 z4 }
	xx = mmShuffle(sx,sx,0,0,0,3);	// xx = { x1 x1 x1 x2 }
	xy = mmShuffle(xx,sy,3,3,2,2);	// xy = { x2 x2 x3 x3 }
	xz = mmShuffle(xy,sz,2,2,1,1);	// xz = { x3 x4 x4 x4 }
	xz = mmShuffle(xz,xz,0,2,2,3);

	yx = mmShuffle(sx,sy,1,1,0,0);	// yx = { y1 y1 y1 y2 }
	yx = mmShuffle(yx,yx,0,1,1,3);
	yy = mmShuffle(yx,sy,3,3,3,3);	// yy = { y2 y2 y3 y3 }
	yz = mmShuffle(yy,sz,3,3,2,2);	// yz = { y3 y4 y4 y4 }
	yz = mmShuffle(yz,yz,0,2,2,3);

	zx = mmShuffle(sx,sy,2,2,1,1);	// zx = { z1 z1 z1 z2 }
	zx = mmShuffle(zx,zx,0,1,1,3);
	zy = mmShuffle(zx,sz,3,3,0,0);	// zy = { z2 z2 z3 z3 }
	zz = mmShuffle(sz,sz,0,3,3,3);	// zz = { z3 z4 z4 z4 }
}

inline void LLV4Matrix3dx4::packRowX()
{
	V4F32 vt;
	vt = mmShuffle(xy,xx,0,1,2,3);
	xx = mmShuffle(xx,vt,0,1,2,0);
	xy = mmShuffle(xy,xz,1,2,0,1);
	vt = mmShuffle(xw,xz,0,3,2,2);
	xz = mmShuffle(vt,xw,2,0,1,2);
}

inline void LLV4Matrix3dx4::packRowY()
{
	V4F32 vt;
	vt = mmShuffle(yy,yx,0,1,2,3);
	yx = mmShuffle(yx,vt,0,1,2,0);
	yy = mmShuffle(yy,yz,1,2,0,1);
	vt = mmShuffle(yw,yz,0,3,2,2);
	yz = mmShuffle(vt,yw,2,0,1,2);
}

inline void LLV4Matrix3dx4::packRowZ()
{
	V4F32 vt;
	vt = mmShuffle(zy,zx,0,1,2,3);
	zx = mmShuffle(zx,vt,0,1,2,0);
	zy = mmShuffle(zy,zz,1,2,0,1);
	vt = mmShuffle(zw,zz,0,3,2,2);
	zz = mmShuffle(vt,zw,2,0,1,2);
}

inline void LLV4Matrix3dx4::packRowW()
{
	V4F32 vt;
	vt = mmShuffle(wy,wx,0,1,2,3);
	wx = mmShuffle(wx,vt,0,1,2,0);
	wy = mmShuffle(wy,wz,1,2,0,1);
	vt = mmShuffle(ww,wz,0,3,2,2);
	wz = mmShuffle(vt,ww,2,0,1,2);
}

// Packs all four rows (X, Y, Z, W). Each row is packed independently, so
// the 4th lane of every vector in the matrix is discarded.
inline void LLV4Matrix3dx4::packRows()
{
	packRowX();
	packRowY();
	packRowZ();
	packRowW();
}

inline void LLV4Matrix3dx4::unpackRowW()
{
	ww = mmShuffle(wz,wz,1,2,3,0);
	wz = mmShuffle(wy,wz,2,3,0,0);
	wy = mmShuffle(wy,wx,0,1,3,0);
	wy = mmShuffle(wy,wy,2,0,1,0);
}


//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// LLV4Matrix3dx4
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

//---

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// LLViewerJointMesh::updateGeometry()
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

void update_geometry_sse2(LLFace *face, LLPolyMesh *mesh)
{
	LLV4Matrix4		sJointMat[32];

	//upload joint pivots/matrices
	{
		LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
		LLDynamicArray<LLJointRenderData*>::const_iterator i = joint_data.begin(), iend = joint_data.end();
		LLV4Matrix4*		mat		= &sJointMat[0];
		LLJointRenderData*	jd		= *i;
		register V4F32		vw;
		const LLMatrix4*	w;
		const LLVector3*	v;
		for(; jd; ++mat)
		{
			w = jd->mWorldMatrix;
			if(jd->mSkinJoint)
			{
				v = &jd->mSkinJoint->mRootToJointSkinOffset;
				jd = ++i < iend ? *i : NULL;
			}
			else
			{
				v = &(jd = *++i)->mSkinJoint->mRootToParentJointSkinOffset;
			}
			if((unsigned)w & 0xF)
			{
				vw = _mm_loadu_ps(w->mMatrix[VW]);
				vw			= _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VX]), mat->mV[VX] = _mm_loadu_ps(w->mMatrix[VX])));
				vw			= _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VY]), mat->mV[VY] = _mm_loadu_ps(w->mMatrix[VY])));
				mat->mV[VW] = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VZ]), mat->mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ])));
			}
			else
			{
				vw = _mm_load_ps(w->mMatrix[VW]);
				vw			= _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VX]), mat->mV[VX] = _mm_load_ps(w->mMatrix[VX])));
				vw			= _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VY]), mat->mV[VY] = _mm_load_ps(w->mMatrix[VY])));
				mat->mV[VW] = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VZ]), mat->mV[VZ] = _mm_load_ps(w->mMatrix[VZ])));
			}
		}
	}

	//update geometry
	{
		LLV4Matrix3dx4	m_verts;
		LLV4Matrix3dx4	m_norms;
		LLV4Matrix3dx4	m_lerps;
		
		struct vertex_buffer		// *NOTE: relies on a specific vertex buffer structure
		{
			LLVector3		vertex;
			LLVector3		normal;
		};

		LL_LLV4MATH_ALIGN_PREFIX
		union						// aligned stack for manual loop vectorization
		{
			__m128i					vp;
			struct
			{
				vertex_buffer*		vb;
				const F32*			weights;
				const LLVector3*	coords;
				const LLVector3*	normals;
			};
		}
		LL_LLV4MATH_ALIGN_POSTFIX;
		
		U32				stride			= (U32)face->mVertexBuffer->getStride();

		vb				= (vertex_buffer*)((U8*)face->mVertexBuffer->getMappedData() + stride * mesh->mFaceVertexOffset);
		weights			= mesh->getWeights();
		coords			= mesh->getCoords();
		normals			= mesh->getNormals();

		LLV4Vector3		vweights;
		LLV4Vector3		xweights;
		xweights.v		= _mm_set1_ps(F32_MAX);

vert_comps += mesh->getNumVertices();

		U32 index = 0, index_end = mesh->getNumVertices();

		__m128i vsz = _mm_setr_epi32(stride, sizeof(F32), sizeof(LLVector3), sizeof(LLVector3));
		vsz = _mm_add_epi32(vsz,vsz);
		vsz = _mm_add_epi32(vsz,vsz);
		for(index = index_end/4+1; --index; vp = _mm_add_epi32(vp, vsz))

//		index_end -= 4;
//		for(; index < index_end; index+=4, coords+=4, normals+=4, weights+=4, vb = (vertex_buffer*)((U8*)vb+stride*4))
		{
			m_verts.load4VectorsAndTranspose(coords);
			m_norms.load4VectorsAndTranspose(normals);
			vweights.v = _mm_loadu_ps(weights);
			_mm_prefetch((char*)(coords+4),_MM_HINT_NTA);
			_mm_prefetch((char*)(coords+5),_MM_HINT_NTA);
			_mm_prefetch((char*)(coords+6),_MM_HINT_NTA);
			_mm_prefetch((char*)(normals+4),_MM_HINT_NTA);
			_mm_prefetch((char*)(normals+5),_MM_HINT_NTA);
			_mm_prefetch((char*)(normals+6),_MM_HINT_NTA);
			_mm_prefetch((char*)(weights+4),_MM_HINT_NTA);
			int r = _mm_movemask_ps(_mm_cmpeq_ps(vweights.v, xweights.v));
			if(r!=0xF)
			{
				lerp_comps+=4;
				if(!(r&1))
				{
					lerp_bits++;
					V4F32 vi = vweights.v;
					S32 joint = _mm_cvttss_si32(vi);
					vi =  _mm_cvtsi32_ss(vi, joint);
					vi = _mm_sub_ps(vweights.v, vi);
					vi = mmShuffle(vi,vi,0,0,0,0);
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xx = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yx = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zx = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.wx = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				if(vweights.mV[VY] == vweights.mV[VX])
				{
					m_lerps.xy = m_lerps.xx;
					m_lerps.yy = m_lerps.yx;
					m_lerps.zy = m_lerps.zx;
					m_lerps.wy = m_lerps.wx;
				}
				else
				{
					lerp_bits++;
					V4F32 vi = vweights.v;
					vi = mmShuffle(vi,vi,1,1,1,1);
					S32 joint = _mm_cvttss_si32(vi);
					vi =  _mm_cvtsi32_ss(vi, joint);
					vi = mmShuffle(vi,vi,0,0,0,0);
					vi = _mm_sub_ps(vweights.v, vi);
					vi = mmShuffle(vi,vi,1,1,1,1);
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xy = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yy = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zy = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.wy = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				if(vweights.mV[VZ] == vweights.mV[VX])
				{
					m_lerps.xz = m_lerps.xx;
					m_lerps.yz = m_lerps.yx;
					m_lerps.zz = m_lerps.zx;
					m_lerps.wz = m_lerps.wx;
				}
				else
				if(vweights.mV[VZ] == vweights.mV[VY])
				{
					m_lerps.xz = m_lerps.xy;
					m_lerps.yz = m_lerps.yy;
					m_lerps.zz = m_lerps.zy;
					m_lerps.wz = m_lerps.wy;
				}
				else
				{
					lerp_bits++;
					V4F32 vi = vweights.v;
					vi = mmShuffle(vi,vi,2,2,2,2);
					S32 joint = _mm_cvttss_si32(vi);
					vi =  _mm_cvtsi32_ss(vi, joint);
					vi = mmShuffle(vi,vi,0,0,0,0);
					vi = _mm_sub_ps(vweights.v, vi);
					vi = mmShuffle(vi,vi,2,2,2,2);
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xz = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yz = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zz = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.wz = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				if(vweights.mV[VW] == vweights.mV[VX])
				{
					m_lerps.xw = m_lerps.xx;
					m_lerps.yw = m_lerps.yx;
					m_lerps.zw = m_lerps.zx;
					m_lerps.ww = m_lerps.wx;
				}
				else
				if(vweights.mV[VW] == vweights.mV[VY])
				{
					m_lerps.xw = m_lerps.xy;
					m_lerps.yw = m_lerps.yy;
					m_lerps.zw = m_lerps.zy;
					m_lerps.ww = m_lerps.wy;
				}
				else
				if(vweights.mV[VW] == vweights.mV[VZ])
				{
					m_lerps.xw = m_lerps.xz;
					m_lerps.yw = m_lerps.yz;
					m_lerps.zw = m_lerps.zz;
					m_lerps.ww = m_lerps.wz;
				}
				else
				{
					lerp_bits++;
					V4F32 vi = vweights.v;
					vi = mmShuffle(vi,vi,3,3,3,3);
					S32 joint = _mm_cvttss_si32(vi);
					vi =  _mm_cvtsi32_ss(vi, joint);
					vi = mmShuffle(vi,vi,0,0,0,0);
					vi = _mm_sub_ps(vweights.v, vi);
					vi = mmShuffle(vi,vi,3,3,3,3);
					LLV4Matrix4* mat = &sJointMat[joint];
					m_lerps.xw = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
					m_lerps.yw = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
					m_lerps.zw = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
					m_lerps.ww = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
				}
				m_lerps.packRows();
				xweights.v = vweights.v;
			}

			m_verts.wx = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xx, m_lerps.xx), _mm_mul_ps(m_verts.yx, m_lerps.yx)), _mm_mul_ps(m_verts.zx, m_lerps.zx)),m_lerps.wx);
			m_verts.wy = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xy, m_lerps.xy), _mm_mul_ps(m_verts.yy, m_lerps.yy)), _mm_mul_ps(m_verts.zy, m_lerps.zy)),m_lerps.wy);
			m_verts.wz = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xz, m_lerps.xz), _mm_mul_ps(m_verts.yz, m_lerps.yz)), _mm_mul_ps(m_verts.zz, m_lerps.zz)),m_lerps.wz);

			m_norms.wx = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xx, m_lerps.xx), _mm_mul_ps(m_norms.yx, m_lerps.yx)), _mm_mul_ps(m_norms.zx, m_lerps.zx));
			m_norms.wy = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xy, m_lerps.xy), _mm_mul_ps(m_norms.yy, m_lerps.yy)), _mm_mul_ps(m_norms.zy, m_lerps.zy));
			m_norms.wz = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xz, m_lerps.xz), _mm_mul_ps(m_norms.yz, m_lerps.yz)), _mm_mul_ps(m_norms.zz, m_lerps.zz));


/*
			((vertex_buffer*)((U8*)vb+stride*0))->vertex.setVec(&m_verts.mRow[VW][0]);
			((vertex_buffer*)((U8*)vb+stride*1))->vertex.setVec(&m_verts.mRow[VW][3]);
			((vertex_buffer*)((U8*)vb+stride*2))->vertex.setVec(&m_verts.mRow[VW][6]);
			((vertex_buffer*)((U8*)vb+stride*3))->vertex.setVec(&m_verts.mRow[VW][9]);

			((vertex_buffer*)((U8*)vb+stride*0))->normal.setVec(&m_norms.mRow[VW][0]);
			((vertex_buffer*)((U8*)vb+stride*1))->normal.setVec(&m_norms.mRow[VW][3]);
			((vertex_buffer*)((U8*)vb+stride*2))->normal.setVec(&m_norms.mRow[VW][6]);
			((vertex_buffer*)((U8*)vb+stride*3))->normal.setVec(&m_norms.mRow[VW][9]);
*/


			// interleave
			{
				V4F32 s;							// v1  v2  v3  v4
				m_verts.unpackRowW();				// n1  n2  n3  n4
				m_norms.unpackRowW();				//       ::
				s = m_verts.wy;						// v1  n1  v3  n3
				m_verts.wy = m_norms.wx;			// v2  n2  v4  n4
				m_norms.wx = s;
				s = m_verts.ww;
				m_verts.ww = m_norms.wz;
				m_norms.wz = s;
				m_verts.packRowW();
				m_norms.packRowW();
			}

			_mm_storeu_ps(        ((float*)((U8*)vb+stride*0)+0), m_verts.wx); // v1x v1y v1z n1x
			_mm_storel_pi((__m64*)((float*)((U8*)vb+stride*0)+4), m_verts.wy); // n1y n1z

			_mm_storeu_ps(        ((float*)((U8*)vb+stride*1)+0), m_norms.wx); // v2x v2y v2z n2x
			_mm_storel_pi((__m64*)((float*)((U8*)vb+stride*1)+4), m_norms.wy); // n2y n2z

			_mm_storeh_pi((__m64*)((float*)((U8*)vb+stride*2)+0), m_verts.wy); // v3x v3y
			_mm_storeu_ps(        ((float*)((U8*)vb+stride*2)+2), m_verts.wz); // v3z n3x n3y n3x

			_mm_storeh_pi((__m64*)((float*)((U8*)vb+stride*3)+0), m_norms.wy); // v4x v4y
			_mm_storeu_ps(        ((float*)((U8*)vb+stride*3)+2), m_norms.wz); // v4z n4x n4y n4z


/*
			_mm_stream_pi(((__m64*)((U8*)vb+stride*0))+0, m_verts.m64Row[VW][0]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*0))+1, m_verts.m64Row[VW][1]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*0))+2, m_verts.m64Row[VW][2]);

			_mm_stream_pi(((__m64*)((U8*)vb+stride*1))+0, m_norms.m64Row[VW][0]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*1))+1, m_norms.m64Row[VW][1]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*1))+2, m_norms.m64Row[VW][2]);

			_mm_stream_pi(((__m64*)((U8*)vb+stride*2))+0, m_verts.m64Row[VW][3]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*2))+1, m_verts.m64Row[VW][4]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*2))+2, m_verts.m64Row[VW][5]);

			_mm_stream_pi(((__m64*)((U8*)vb+stride*3))+0, m_norms.m64Row[VW][3]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*3))+1, m_norms.m64Row[VW][4]);
			_mm_stream_pi(((__m64*)((U8*)vb+stride*3))+2, m_norms.m64Row[VW][5]);
*/
		}

	index = (index_end/4)*4;
		LLV4Matrix4			blend_mat;
		F32 weight = F32_MAX;
		index_end = mesh->getNumVertices();
		for (; index < index_end; ++index, ++coords, ++normals, ++weights, vb = (vertex_buffer*)((U8*)vb+stride))
		{
			if( weight != *weights)
			{
				S32 joint = S32(weight = *weights);
				blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);
			}
			blend_mat.multiply(*coords, vb->vertex);
			((LLV4Matrix3)blend_mat).multiply(*normals, vb->normal);
		}
	}
	//_mm_sfence();
	//_mm_empty();
}

#else

// Build without LL_VECTORIZE: no SSE2 implementation is compiled, so simply
// forward to update_geometry_vec().
void update_geometry_sse2(LLFace *face, LLPolyMesh *mesh)
{
	update_geometry_vec(face, mesh);
}

#endif



More information about the SLDev mailing list