[sldev] llviewerjointmesh_sse2.cpp

Dirk Moerenhout blakar at gmail.com
Mon Jul 9 11:16:23 PDT 2007


> It doesn't use globals. [See VWR-1610]

What do you call these?

U32 lerp_comps = 0;
U32 vert_comps = 0;
U32 lerp_bits = 0;

I fear what was reported means that the compiler may use SSE instructions
(e.g. SSE moves) to initialise variables faster, so any global may be a
danger. Of course, it could be that this is limited to class
constructors, in which case it's no issue.

Dirk aka Blakar Ogre

>
> Note, this was tested with the older version of the viewer, may need to
> adjust something. (then again...may not.)
>
> =)
>
> --
> Power to Change the Void
>
> /**
>  * @file llviewerjointmesh_sse2.cpp
>  * @brief LLV4 class implementation with LLViewerJointMesh class
>  *
>  * Copyright (c) 2007, Linden Research, Inc.
>  * License: GPLv3 -- http://www.gnu.org/copyleft/gpl.html
>  */
>
> //-----------------------------------------------------------------------------
> // Header Files
> //-----------------------------------------------------------------------------
>
> // Do not use precompiled headers, because we need to build this file with
> // SSE support, but not the precompiled header file. JC
> #include "linden_common.h"
>
> #include "llviewerjointmesh.h"
>
> // project includes
> #include "llface.h"
> #include "llpolymesh.h"
>
> // library includes
> #include "lldarray.h"
> #include "llstrider.h"
> #include "llv4matrix4.h"
> #include "m4math.h"
> #include "v3math.h"
>
> // *NOTE: SSE2 must be enabled for this module
>
> #if LL_VECTORIZE
>
> #include "emmintrin.h"
>
>
> #if LL_MSVC
> #pragma warning( disable     : 4701 )   // "potentially uninitialized local variable"  -- disabled
> #endif
>
> // Per-lane linear interpolation: each of the four floats of the result is a + (b - a) * w.
> inline V4F32 llv4lerp(V4F32 a, V4F32 b, V4F32 w)
> {
>         const V4F32 span = _mm_sub_ps(b, a);            // b - a
>         const V4F32 step = _mm_mul_ps(span, w);         // (b - a) * w
>         return _mm_add_ps(a, step);                     // a + (b - a) * w
> }
>
> //-----------------------------------------------------------------------
> //-----------------------------------------------------------------------
> // LLV4Matrix3dx4
> //-----------------------------------------------------------------------
> //-----------------------------------------------------------------------
>
> // Running tallies bumped by update_geometry_sse2(); nothing in the code shown here reads them back.
> U32 lerp_comps = 0;     // +4 each time a group of four vertices needs its blend matrices rebuilt
> U32 vert_comps = 0;     // + the mesh's vertex count on every call
> U32 lerp_bits = 0;      // +1 for every individual joint-matrix interpolation performed
>
>
> // NOTE: LLV4Matrix3dx4 is specially designed to swizzle XYZ data streams.
> // Storage is four rows (x, y, z, w) of four 4-float vectors each, overlaid with flat float and integer views.
> LL_LLV4MATH_ALIGN_PREFIX
>
> class LLV4Matrix3dx4
> {
> public:
>         union
>         {
>                 F32                     mArray[4*4*4];          // all 64 floats, flat
>                 F32                     mRow[4][4*4];           // 16 floats per row
>                 __m64           mRow_i64[4][4*2];               // each row viewed as eight 64-bit pieces
>                 __m128i         mRow_i128[4][4];                // each row viewed as four 128-bit integer vectors
>                 struct
>                 {
>                         V4F32   xx, xy, xz, xw;                 // row x
>                         V4F32   yx, yy, yz, yw;                 // row y
>                         V4F32   zx, zy, zz, zw;                 // row z
>                         V4F32   wx, wy, wz, ww;                 // row w
>                 };
>         };
>
>         void                    load4VectorsAndTranspose(const LLVector3* a);   // fill rows x, y, z from four consecutive LLVector3s
>         void                    packRowX();                                     // squeeze the xyz lanes of xx..xw into xx, xy, xz
>         void                    packRowY();                                     // same for row y
>         void                    packRowZ();                                     // same for row z
>         void                    packRowW();                                     // same for row w
>         void                    packRows();                                     // pack all four rows
>         void                    unpackRowW();                                   // undo packRowW(): spread wx, wy, wz back over wx..ww
> }
>
> LL_LLV4MATH_ALIGN_POSTFIX;
>
> // mmShuffle(d,s,d1,d2,s3,s4) yields { d[d1], d[d2], s[s3], s[s4] }: the two low lanes come from d, the two high lanes from s.
> #define mmShuffle(d,s,d1,d2,s3,s4)      _mm_shuffle_ps(d,s,_MM_SHUFFLE(s4,s3,d2,d1))
> // Reads 12 consecutive floats starting at a (treated as four xyz triples) and spreads them over rows x, y and z in the replicated layout shown below; row w is not written.
> inline void LLV4Matrix3dx4::load4VectorsAndTranspose(const LLVector3* a)
> {
>         const V4F32 sx = _mm_loadu_ps((float*)a+0);     // floats 0..3, unaligned load
>         const V4F32 sy = _mm_loadu_ps((float*)a+4);     // floats 4..7
>         const V4F32 sz = _mm_loadu_ps((float*)a+8);     // floats 8..11
>                                                                         // {sx,sy,sz} = { x1 y1 z1 x2, y2 z2 x3 y3, z3 x4 y4 z4 }
>         xx = mmShuffle(sx,sx,0,0,0,3);  // xx = { x1 x1 x1 x2 }
>         xy = mmShuffle(xx,sy,3,3,2,2);  // xy = { x2 x2 x3 x3 }
>         xz = mmShuffle(xy,sz,2,2,1,1);  // xz = { x3 x3 x4 x4 } (intermediate)
>         xz = mmShuffle(xz,xz,0,2,2,3);  // xz = { x3 x4 x4 x4 }
>
>         yx = mmShuffle(sx,sy,1,1,0,0);  // yx = { y1 y1 y2 y2 } (intermediate)
>         yx = mmShuffle(yx,yx,0,1,1,3);  // yx = { y1 y1 y1 y2 }
>         yy = mmShuffle(yx,sy,3,3,3,3);  // yy = { y2 y2 y3 y3 }
>         yz = mmShuffle(yy,sz,3,3,2,2);  // yz = { y3 y3 y4 y4 } (intermediate)
>         yz = mmShuffle(yz,yz,0,2,2,3);  // yz = { y3 y4 y4 y4 }
>
>         zx = mmShuffle(sx,sy,2,2,1,1);  // zx = { z1 z1 z2 z2 } (intermediate)
>         zx = mmShuffle(zx,zx,0,1,1,3);  // zx = { z1 z1 z1 z2 }
>         zy = mmShuffle(zx,sz,3,3,0,0);  // zy = { z2 z2 z3 z3 }
>         zz = mmShuffle(sz,sz,0,3,3,3);  // zz = { z3 z4 z4 z4 }
> }
>
> // Compacts the xyz lanes of xx, xy, xz and xw into twelve consecutive floats held in xx, xy, xz.
> // With xx={a0 a1 a2 a3}, xy={b0 b1 b2 -}, xz={c0 c1 c2 -}, xw={d0 d1 d2 d3} the result is
> // xx={a0 a1 a2 b0}, xy={b1 b2 c0 c1}, xz={c2 d0 d1 d2}; xw itself is left untouched.
> inline void LLV4Matrix3dx4::packRowX()
> {
>         const V4F32 head = mmShuffle(xy,xx,0,1,2,3);    // { b0 b1 a2 a3 }
>         xx = mmShuffle(xx,head,0,1,2,0);                // { a0 a1 a2 b0 }
>         xy = mmShuffle(xy,xz,1,2,0,1);                  // { b1 b2 c0 c1 }
>         const V4F32 tail = mmShuffle(xw,xz,0,3,2,2);    // { d0 d3 c2 c2 }
>         xz = mmShuffle(tail,xw,2,0,1,2);                // { c2 d0 d1 d2 }
> }
>
> // Compacts the xyz lanes of yx, yy, yz and yw into twelve consecutive floats held in yx, yy, yz.
> // With yx={a0 a1 a2 a3}, yy={b0 b1 b2 -}, yz={c0 c1 c2 -}, yw={d0 d1 d2 d3} the result is
> // yx={a0 a1 a2 b0}, yy={b1 b2 c0 c1}, yz={c2 d0 d1 d2}; yw itself is left untouched.
> inline void LLV4Matrix3dx4::packRowY()
> {
>         const V4F32 head = mmShuffle(yy,yx,0,1,2,3);    // { b0 b1 a2 a3 }
>         yx = mmShuffle(yx,head,0,1,2,0);                // { a0 a1 a2 b0 }
>         yy = mmShuffle(yy,yz,1,2,0,1);                  // { b1 b2 c0 c1 }
>         const V4F32 tail = mmShuffle(yw,yz,0,3,2,2);    // { d0 d3 c2 c2 }
>         yz = mmShuffle(tail,yw,2,0,1,2);                // { c2 d0 d1 d2 }
> }
>
> // Compacts the xyz lanes of zx, zy, zz and zw into twelve consecutive floats held in zx, zy, zz.
> // With zx={a0 a1 a2 a3}, zy={b0 b1 b2 -}, zz={c0 c1 c2 -}, zw={d0 d1 d2 d3} the result is
> // zx={a0 a1 a2 b0}, zy={b1 b2 c0 c1}, zz={c2 d0 d1 d2}; zw itself is left untouched.
> inline void LLV4Matrix3dx4::packRowZ()
> {
>         const V4F32 head = mmShuffle(zy,zx,0,1,2,3);    // { b0 b1 a2 a3 }
>         zx = mmShuffle(zx,head,0,1,2,0);                // { a0 a1 a2 b0 }
>         zy = mmShuffle(zy,zz,1,2,0,1);                  // { b1 b2 c0 c1 }
>         const V4F32 tail = mmShuffle(zw,zz,0,3,2,2);    // { d0 d3 c2 c2 }
>         zz = mmShuffle(tail,zw,2,0,1,2);                // { c2 d0 d1 d2 }
> }
>
> // Compacts the xyz lanes of wx, wy, wz and ww into twelve consecutive floats held in wx, wy, wz.
> // With wx={a0 a1 a2 a3}, wy={b0 b1 b2 -}, wz={c0 c1 c2 -}, ww={d0 d1 d2 d3} the result is
> // wx={a0 a1 a2 b0}, wy={b1 b2 c0 c1}, wz={c2 d0 d1 d2}; ww itself is left untouched.
> inline void LLV4Matrix3dx4::packRowW()
> {
>         const V4F32 head = mmShuffle(wy,wx,0,1,2,3);    // { b0 b1 a2 a3 }
>         wx = mmShuffle(wx,head,0,1,2,0);                // { a0 a1 a2 b0 }
>         wy = mmShuffle(wy,wz,1,2,0,1);                  // { b1 b2 c0 c1 }
>         const V4F32 tail = mmShuffle(ww,wz,0,3,2,2);    // { d0 d3 c2 c2 }
>         wz = mmShuffle(tail,ww,2,0,1,2);                // { c2 d0 d1 d2 }
> }
>
> // Packs every row of the matrix: x, then y, then z, then w.
> inline void LLV4Matrix3dx4::packRows()
> {
>         this->packRowX();       // xx, xy, xz <- xyz lanes of xx..xw
>         this->packRowY();       // yx, yy, yz <- xyz lanes of yx..yw
>         this->packRowZ();       // zx, zy, zz <- xyz lanes of zx..zw
>         this->packRowW();       // wx, wy, wz <- xyz lanes of wx..ww
> }
>
> // Undoes packRowW(): expands the twelve packed floats in wx, wy, wz back into four xyz vectors.
> // With wx={a0 a1 a2 b0}, wy={b1 b2 c0 c1}, wz={c2 d0 d1 d2} the result is
> // wx={a0 a1 a2 b0} (unchanged), wy={b0 b1 b2 b1}, wz={c0 c1 c2 c2}, ww={d0 d1 d2 c2};
> // only the first three lanes of each vector carry the triple, the fourth lane is a leftover.
> inline void LLV4Matrix3dx4::unpackRowW()
> {
>         // Order matters: every source vector is read before it is overwritten.
>         ww = mmShuffle(wz,wz,1,2,3,0);                          // { d0 d1 d2 c2 }
>         wz = mmShuffle(wy,wz,2,3,0,0);                          // { c0 c1 c2 c2 }
>         const V4F32 rotated = mmShuffle(wy,wx,0,1,3,0);         // { b1 b2 b0 a0 }
>         wy = mmShuffle(rotated,rotated,2,0,1,0);                // { b0 b1 b2 b1 }
> }
>
>
> //-----------------------------------------------------------------------
> //-----------------------------------------------------------------------
> // LLV4Matrix3dx4
> //-----------------------------------------------------------------------
> //-----------------------------------------------------------------------
>
> //---
>
> //-----------------------------------------------------------------------
> //-----------------------------------------------------------------------
> // LLViewerJointMesh::updateGeometry()
> //-----------------------------------------------------------------------
> //-----------------------------------------------------------------------
>
> // Skins one face on the CPU with SSE/SSE2 intrinsics.
> //
> // Step 1 builds one matrix per entry of the reference mesh's joint render data into sJointMat.
> // Step 2 transforms every vertex/normal of `mesh` and writes the interleaved result into the
> // mapped vertex buffer of `face`, four vertices per iteration; the remaining
> // (numVertices % 4) vertices go through a scalar loop.
> //
> // Each weight is read as "joint index + blend fraction": the truncated integer part selects
> // sJointMat[joint] and the fraction interpolates towards sJointMat[joint+1].
> //
> // Portability notes:
> //  - the alignment test casts the pointer to size_t (a cast to unsigned truncates 64-bit pointers);
> //  - the four stream pointers are advanced with ordinary pointer arithmetic rather than a packed
> //    32-bit integer add, which is only valid when pointers are 32 bits wide;
> //  - an empty joint list is not dereferenced.
> void update_geometry_sse2(LLFace *face, LLPolyMesh *mesh)
> {
>         LLV4Matrix4             sJointMat[32];  // NOTE(review): fixed capacity, not checked against the number of joints
>
>         //upload joint pivots/matrices
>         {
>                 LLDynamicArray<LLJointRenderData*>& joint_data = mesh->getReferenceMesh()->mJointRenderData;
>                 LLDynamicArray<LLJointRenderData*>::const_iterator i = joint_data.begin(), iend = joint_data.end();
>                 LLV4Matrix4*            mat             = &sJointMat[0];
>                 LLJointRenderData*      jd              = (i < iend) ? *i : NULL;      // empty list: the loop below is skipped
>                 V4F32                   vw;
>                 const LLMatrix4*        w;
>                 const LLVector3*        v;
>                 for(; jd; ++mat)
>                 {
>                         w = jd->mWorldMatrix;
>                         if(jd->mSkinJoint)
>                         {
>                                 v = &jd->mSkinJoint->mRootToJointSkinOffset;
>                                 jd = ++i < iend ? *i : NULL;
>                         }
>                         else
>                         {
>                                 // This entry has no skin joint: the offset comes from the next entry, which then
>                                 // becomes the current one. NOTE(review): assumes such an entry is never the last
>                                 // in the list and that its successor has a skin joint -- neither is checked.
>                                 v = &(jd = *++i)->mSkinJoint->mRootToParentJointSkinOffset;
>                         }
>
>                         // Rows X, Y, Z are copied from the world matrix; row W becomes
>                         // W + v.x * X + v.y * Y + v.z * Z.
>                         if((size_t)w & 0xF)
>                         {
>                                 // world matrix is not 16-byte aligned: unaligned loads
>                                 vw = _mm_loadu_ps(w->mMatrix[VW]);
>                                 vw                      = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VX]), mat->mV[VX] = _mm_loadu_ps(w->mMatrix[VX])));
>                                 vw                      = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VY]), mat->mV[VY] = _mm_loadu_ps(w->mMatrix[VY])));
>                                 mat->mV[VW] = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VZ]), mat->mV[VZ] = _mm_loadu_ps(w->mMatrix[VZ])));
>                         }
>                         else
>                         {
>                                 // 16-byte aligned: aligned loads
>                                 vw = _mm_load_ps(w->mMatrix[VW]);
>                                 vw                      = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VX]), mat->mV[VX] = _mm_load_ps(w->mMatrix[VX])));
>                                 vw                      = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VY]), mat->mV[VY] = _mm_load_ps(w->mMatrix[VY])));
>                                 mat->mV[VW] = _mm_add_ps(vw, _mm_mul_ps(_mm_set1_ps(v->mV[VZ]), mat->mV[VZ] = _mm_load_ps(w->mMatrix[VZ])));
>                         }
>                 }
>         }
>
>         //update geometry
>         {
>                 LLV4Matrix3dx4  m_verts;
>                 LLV4Matrix3dx4  m_norms;
>                 LLV4Matrix3dx4  m_lerps;        // blend matrices for the current group of four vertices, kept in packed form between groups
>
>                 struct vertex_buffer            // *NOTE: relies on a specific vertex buffer structure
>                 {
>                         LLVector3               vertex;
>                         LLVector3               normal;
>                 };
>
>                 U32                             stride                  = (U32)face->mVertexBuffer->getStride();
>
>                 // The four streams walked in lock step, four elements per vectorized iteration.
>                 vertex_buffer*          vb              = (vertex_buffer*)((U8*)face->mVertexBuffer->getMappedData() + stride * mesh->mFaceVertexOffset);
>                 const F32*              weights         = mesh->getWeights();
>                 const LLVector3*        coords          = mesh->getCoords();
>                 const LLVector3*        normals         = mesh->getNormals();
>
>                 LLV4Vector3             vweights;       // weights of the current group
>                 LLV4Vector3             xweights;       // weights of the last group whose matrices were built
>                 xweights.v              = _mm_set1_ps(F32_MAX);         // sentinel so the first group builds its matrices
>
>                 vert_comps += mesh->getNumVertices();
>
>                 U32 index = 0, index_end = mesh->getNumVertices();
>
>                 // index counts the remaining groups of four; it is 0 when the loop ends.
>                 for(index = index_end/4; index; --index, coords += 4, normals += 4, weights += 4, vb = (vertex_buffer*)((U8*)vb + stride*4))
>                 {
>                         m_verts.load4VectorsAndTranspose(coords);
>                         m_norms.load4VectorsAndTranspose(normals);
>                         vweights.v = _mm_loadu_ps(weights);
>                         _mm_prefetch((char*)(coords+4),_MM_HINT_NTA);
>                         _mm_prefetch((char*)(coords+5),_MM_HINT_NTA);
>                         _mm_prefetch((char*)(coords+6),_MM_HINT_NTA);
>                         _mm_prefetch((char*)(normals+4),_MM_HINT_NTA);
>                         _mm_prefetch((char*)(normals+5),_MM_HINT_NTA);
>                         _mm_prefetch((char*)(normals+6),_MM_HINT_NTA);
>                         _mm_prefetch((char*)(weights+4),_MM_HINT_NTA);
>
>                         // Bit k of r is set when weight k equals weight k of the last built group.
>                         // r == 0xF means all four match, so the packed m_lerps can be reused as is.
>                         int r = _mm_movemask_ps(_mm_cmpeq_ps(vweights.v, xweights.v));
>                         if(r!=0xF)
>                         {
>                                 lerp_comps+=4;
>
>                                 // Vertex 1. When its weight is unchanged the packed xx/yx/zx/wx are kept:
>                                 // packing leaves their first three lanes intact, and only those are used below.
>                                 if(!(r&1))
>                                 {
>                                         lerp_bits++;
>                                         V4F32 vi = vweights.v;
>                                         S32 joint = _mm_cvttss_si32(vi);                // integer part of the weight
>                                         vi =  _mm_cvtsi32_ss(vi, joint);
>                                         vi = _mm_sub_ps(vweights.v, vi);                // lane 0 = fractional part
>                                         vi = mmShuffle(vi,vi,0,0,0,0);
>                                         LLV4Matrix4* mat = &sJointMat[joint];
>                                         m_lerps.xx = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
>                                         m_lerps.yx = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
>                                         m_lerps.zx = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
>                                         m_lerps.wx = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
>                                 }
>
>                                 // Vertex 2: reuse vertex 1's matrix when the weights match, otherwise interpolate.
>                                 if(vweights.mV[VY] == vweights.mV[VX])
>                                 {
>                                         m_lerps.xy = m_lerps.xx;
>                                         m_lerps.yy = m_lerps.yx;
>                                         m_lerps.zy = m_lerps.zx;
>                                         m_lerps.wy = m_lerps.wx;
>                                 }
>                                 else
>                                 {
>                                         lerp_bits++;
>                                         V4F32 vi = vweights.v;
>                                         vi = mmShuffle(vi,vi,1,1,1,1);
>                                         S32 joint = _mm_cvttss_si32(vi);
>                                         vi =  _mm_cvtsi32_ss(vi, joint);
>                                         vi = mmShuffle(vi,vi,0,0,0,0);
>                                         vi = _mm_sub_ps(vweights.v, vi);
>                                         vi = mmShuffle(vi,vi,1,1,1,1);
>                                         LLV4Matrix4* mat = &sJointMat[joint];
>                                         m_lerps.xy = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
>                                         m_lerps.yy = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
>                                         m_lerps.zy = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
>                                         m_lerps.wy = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
>                                 }
>
>                                 // Vertex 3: reuse vertex 1's or vertex 2's matrix when possible.
>                                 if(vweights.mV[VZ] == vweights.mV[VX])
>                                 {
>                                         m_lerps.xz = m_lerps.xx;
>                                         m_lerps.yz = m_lerps.yx;
>                                         m_lerps.zz = m_lerps.zx;
>                                         m_lerps.wz = m_lerps.wx;
>                                 }
>                                 else
>                                 if(vweights.mV[VZ] == vweights.mV[VY])
>                                 {
>                                         m_lerps.xz = m_lerps.xy;
>                                         m_lerps.yz = m_lerps.yy;
>                                         m_lerps.zz = m_lerps.zy;
>                                         m_lerps.wz = m_lerps.wy;
>                                 }
>                                 else
>                                 {
>                                         lerp_bits++;
>                                         V4F32 vi = vweights.v;
>                                         vi = mmShuffle(vi,vi,2,2,2,2);
>                                         S32 joint = _mm_cvttss_si32(vi);
>                                         vi =  _mm_cvtsi32_ss(vi, joint);
>                                         vi = mmShuffle(vi,vi,0,0,0,0);
>                                         vi = _mm_sub_ps(vweights.v, vi);
>                                         vi = mmShuffle(vi,vi,2,2,2,2);
>                                         LLV4Matrix4* mat = &sJointMat[joint];
>                                         m_lerps.xz = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
>                                         m_lerps.yz = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
>                                         m_lerps.zz = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
>                                         m_lerps.wz = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
>                                 }
>
>                                 // Vertex 4: reuse any of the three earlier matrices when possible.
>                                 if(vweights.mV[VW] == vweights.mV[VX])
>                                 {
>                                         m_lerps.xw = m_lerps.xx;
>                                         m_lerps.yw = m_lerps.yx;
>                                         m_lerps.zw = m_lerps.zx;
>                                         m_lerps.ww = m_lerps.wx;
>                                 }
>                                 else
>                                 if(vweights.mV[VW] == vweights.mV[VY])
>                                 {
>                                         m_lerps.xw = m_lerps.xy;
>                                         m_lerps.yw = m_lerps.yy;
>                                         m_lerps.zw = m_lerps.zy;
>                                         m_lerps.ww = m_lerps.wy;
>                                 }
>                                 else
>                                 if(vweights.mV[VW] == vweights.mV[VZ])
>                                 {
>                                         m_lerps.xw = m_lerps.xz;
>                                         m_lerps.yw = m_lerps.yz;
>                                         m_lerps.zw = m_lerps.zz;
>                                         m_lerps.ww = m_lerps.wz;
>                                 }
>                                 else
>                                 {
>                                         lerp_bits++;
>                                         V4F32 vi = vweights.v;
>                                         vi = mmShuffle(vi,vi,3,3,3,3);
>                                         S32 joint = _mm_cvttss_si32(vi);
>                                         vi =  _mm_cvtsi32_ss(vi, joint);
>                                         vi = mmShuffle(vi,vi,0,0,0,0);
>                                         vi = _mm_sub_ps(vweights.v, vi);
>                                         vi = mmShuffle(vi,vi,3,3,3,3);
>                                         LLV4Matrix4* mat = &sJointMat[joint];
>                                         m_lerps.xw = llv4lerp(mat->mV[VX], (mat+1)->mV[VX], vi);
>                                         m_lerps.yw = llv4lerp(mat->mV[VY], (mat+1)->mV[VY], vi);
>                                         m_lerps.zw = llv4lerp(mat->mV[VZ], (mat+1)->mV[VZ], vi);
>                                         m_lerps.ww = llv4lerp(mat->mV[VW], (mat+1)->mV[VW], vi);
>                                 }
>                                 m_lerps.packRows();
>                                 xweights.v = vweights.v;
>                         }
>
>                         // positions: x * rowX + y * rowY + z * rowZ + rowW, twelve packed floats in wx, wy, wz
>                         m_verts.wx = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xx, m_lerps.xx), _mm_mul_ps(m_verts.yx, m_lerps.yx)), _mm_mul_ps(m_verts.zx, m_lerps.zx)),m_lerps.wx);
>                         m_verts.wy = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xy, m_lerps.xy), _mm_mul_ps(m_verts.yy, m_lerps.yy)), _mm_mul_ps(m_verts.zy, m_lerps.zy)),m_lerps.wy);
>                         m_verts.wz = _mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(m_verts.xz, m_lerps.xz), _mm_mul_ps(m_verts.yz, m_lerps.yz)), _mm_mul_ps(m_verts.zz, m_lerps.zz)),m_lerps.wz);
>
>                         // normals: same product without the rowW term
>                         m_norms.wx = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xx, m_lerps.xx), _mm_mul_ps(m_norms.yx, m_lerps.yx)), _mm_mul_ps(m_norms.zx, m_lerps.zx));
>                         m_norms.wy = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xy, m_lerps.xy), _mm_mul_ps(m_norms.yy, m_lerps.yy)), _mm_mul_ps(m_norms.zy, m_lerps.zy));
>                         m_norms.wz = _mm_add_ps(_mm_add_ps(_mm_mul_ps(m_norms.xz, m_lerps.xz), _mm_mul_ps(m_norms.yz, m_lerps.yz)), _mm_mul_ps(m_norms.zz, m_lerps.zz));
>
>                         // interleave
>                         {
>                                 V4F32 s;
>                                 m_verts.unpackRowW();                           // m_verts row w = v1  v2  v3  v4
>                                 m_norms.unpackRowW();                           // m_norms row w = n1  n2  n3  n4
>                                 s = m_verts.wy;                                 // swap v2 <-> n1
>                                 m_verts.wy = m_norms.wx;
>                                 m_norms.wx = s;
>                                 s = m_verts.ww;                                 // swap v4 <-> n3
>                                 m_verts.ww = m_norms.wz;
>                                 m_norms.wz = s;
>                                 m_verts.packRowW();                             // m_verts row w = v1  n1  v3  n3 (packed)
>                                 m_norms.packRowW();                             // m_norms row w = v2  n2  v4  n4 (packed)
>                         }
>
>                         _mm_storeu_ps(        ((float*)((U8*)vb+stride*0)+0), m_verts.wx); // v1x v1y v1z n1x
>                         _mm_storel_pi((__m64*)((float*)((U8*)vb+stride*0)+4), m_verts.wy); // n1y n1z
>
>                         _mm_storeu_ps(        ((float*)((U8*)vb+stride*1)+0), m_norms.wx); // v2x v2y v2z n2x
>                         _mm_storel_pi((__m64*)((float*)((U8*)vb+stride*1)+4), m_norms.wy); // n2y n2z
>
>                         _mm_storeh_pi((__m64*)((float*)((U8*)vb+stride*2)+0), m_verts.wy); // v3x v3y
>                         _mm_storeu_ps(        ((float*)((U8*)vb+stride*2)+2), m_verts.wz); // v3z n3x n3y n3z
>
>                         _mm_storeh_pi((__m64*)((float*)((U8*)vb+stride*3)+0), m_norms.wy); // v4x v4y
>                         _mm_storeu_ps(        ((float*)((U8*)vb+stride*3)+2), m_norms.wz); // v4z n4x n4y n4z
>                 }
>
>                 // Scalar tail for the last (numVertices % 4) vertices; the stream pointers already
>                 // point just past the vectorized part.
>                 index = (index_end/4)*4;
>                 LLV4Matrix4                     blend_mat;
>                 F32 weight = F32_MAX;           // sentinel so the first tail vertex builds blend_mat
>                 index_end = mesh->getNumVertices();
>                 for (; index < index_end; ++index, ++coords, ++normals, ++weights, vb = (vertex_buffer*)((U8*)vb+stride))
>                 {
>                         if( weight != *weights)
>                         {
>                                 S32 joint = S32(weight = *weights);
>                                 blend_mat.lerp(sJointMat[joint], sJointMat[joint+1], weight - joint);
>                         }
>                         blend_mat.multiply(*coords, vb->vertex);
>                         ((LLV4Matrix3)blend_mat).multiply(*normals, vb->normal);
>                 }
>         }
> }
>
> #else
>
> // Build without LL_VECTORIZE: no SSE2 path is compiled, so the call is handed straight to update_geometry_vec().
> void update_geometry_sse2(LLFace *face, LLPolyMesh *mesh)
> {
>         update_geometry_vec(face, mesh);
> }
>
> #endif
>
>
> _______________________________________________
> Click here to unsubscribe or manage your list subscription:
> /index.html
>
>


More information about the SLDev mailing list