From 63ef1b3ebfe3dd46122663b7e23ec93078d5dc43 Mon Sep 17 00:00:00 2001 From: Thorsten Liebig Date: Sat, 1 May 2010 12:57:43 +0200 Subject: [PATCH] sse engine: changed the vector order to reduce copy operations --- FDTD/engine_sse.cpp | 75 ++++++++++++++++++++++++++++++------------- FDTD/engine_sse.h | 6 ++-- FDTD/operator_sse.cpp | 2 ++ FDTD/operator_sse.h | 10 +++--- tools/array_ops.cpp | 2 +- 5 files changed, 66 insertions(+), 29 deletions(-) diff --git a/FDTD/engine_sse.cpp b/FDTD/engine_sse.cpp index cb69548..0531d51 100644 --- a/FDTD/engine_sse.cpp +++ b/FDTD/engine_sse.cpp @@ -33,6 +33,7 @@ Engine_sse::Engine_sse(const Operator_sse* op) : Engine(op) { numLines[n] = Op->GetNumberOfLines(n); } + numVectors = ceil((double)numLines[2]/4.0); } Engine_sse::~Engine_sse() @@ -63,34 +64,49 @@ void Engine_sse::UpdateVoltages() bool shift[2]; f4vector temp; + unsigned int maxZ = numVectors; + for (pos[0]=0;pos[0]f4_vv[0][pos[0]][pos[1]][pos[2]].v; - f4_volt[0][pos[0]][pos[1]][pos[2]].v += Op->f4_vi[0][pos[0]][pos[1]][pos[2]].v * ( f4_curr[2][pos[0]][pos[1]][pos[2]].v - f4_curr[2][pos[0]][pos[1]-shift[1]][pos[2]].v - f4_curr[1][pos[0]][pos[1]][pos[2]].v + temp.v ); + f4_volt[0][pos[0]][pos[1]][pos[2]].v += Op->f4_vi[0][pos[0]][pos[1]][pos[2]].v * ( f4_curr[2][pos[0]][pos[1]][pos[2]].v - f4_curr[2][pos[0]][pos[1]-shift[1]][pos[2]].v - f4_curr[1][pos[0]][pos[1]][pos[2]].v + f4_curr[1][pos[0]][pos[1]][pos[2]-1].v ); // y-polarization - temp.f[0] = f4_curr[0][pos[0]][pos[1]][pos[2]-(bool)pos[2]].f[3]; - temp.f[1] = f4_curr[0][pos[0]][pos[1]][pos[2]].f[0]; - temp.f[2] = f4_curr[0][pos[0]][pos[1]][pos[2]].f[1]; - temp.f[3] = f4_curr[0][pos[0]][pos[1]][pos[2]].f[2]; f4_volt[1][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv[1][pos[0]][pos[1]][pos[2]].v; - f4_volt[1][pos[0]][pos[1]][pos[2]].v += Op->f4_vi[1][pos[0]][pos[1]][pos[2]].v * ( f4_curr[0][pos[0]][pos[1]][pos[2]].v - temp.v - f4_curr[2][pos[0]][pos[1]][pos[2]].v + f4_curr[2][pos[0]-shift[0]][pos[1]][pos[2]].v); + f4_volt[1][pos[0]][pos[1]][pos[2]].v += Op->f4_vi[1][pos[0]][pos[1]][pos[2]].v * ( f4_curr[0][pos[0]][pos[1]][pos[2]].v - f4_curr[0][pos[0]][pos[1]][pos[2]-1].v - f4_curr[2][pos[0]][pos[1]][pos[2]].v + f4_curr[2][pos[0]-shift[0]][pos[1]][pos[2]].v); // z-polarization f4_volt[2][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv[2][pos[0]][pos[1]][pos[2]].v; f4_volt[2][pos[0]][pos[1]][pos[2]].v += Op->f4_vi[2][pos[0]][pos[1]][pos[2]].v * ( f4_curr[1][pos[0]][pos[1]][pos[2]].v - f4_curr[1][pos[0]-shift[0]][pos[1]][pos[2]].v - f4_curr[0][pos[0]][pos[1]][pos[2]].v + f4_curr[0][pos[0]][pos[1]-shift[1]][pos[2]].v); } + + // for pos[2] = 0 + // x-polarization + temp.f[0] = 0; + temp.f[1] = f4_curr[1][pos[0]][pos[1]][maxZ-1].f[0]; + temp.f[2] = f4_curr[1][pos[0]][pos[1]][maxZ-1].f[1]; + temp.f[3] = f4_curr[1][pos[0]][pos[1]][maxZ-1].f[2]; + f4_volt[0][pos[0]][pos[1]][0].v *= Op->f4_vv[0][pos[0]][pos[1]][0].v; + f4_volt[0][pos[0]][pos[1]][0].v += Op->f4_vi[0][pos[0]][pos[1]][0].v * ( f4_curr[2][pos[0]][pos[1]][0].v - f4_curr[2][pos[0]][pos[1]-shift[1]][0].v - f4_curr[1][pos[0]][pos[1]][0].v + temp.v ); + + // y-polarization + temp.f[0] = 0; + temp.f[1] = f4_curr[0][pos[0]][pos[1]][maxZ-1].f[0]; + temp.f[2] = f4_curr[0][pos[0]][pos[1]][maxZ-1].f[1]; + temp.f[3] = f4_curr[0][pos[0]][pos[1]][maxZ-1].f[2]; + f4_volt[1][pos[0]][pos[1]][0].v *= Op->f4_vv[1][pos[0]][pos[1]][0].v; + f4_volt[1][pos[0]][pos[1]][0].v += Op->f4_vi[1][pos[0]][pos[1]][0].v * ( f4_curr[0][pos[0]][pos[1]][0].v - temp.v - f4_curr[2][pos[0]][pos[1]][0].v + f4_curr[2][pos[0]-shift[0]][pos[1]][0].v); + + // z-polarization + f4_volt[2][pos[0]][pos[1]][0].v *= Op->f4_vv[2][pos[0]][pos[1]][0].v; + f4_volt[2][pos[0]][pos[1]][0].v += Op->f4_vi[2][pos[0]][pos[1]][0].v * ( f4_curr[1][pos[0]][pos[1]][0].v - f4_curr[1][pos[0]-shift[0]][pos[1]][0].v - f4_curr[0][pos[0]][pos[1]][0].v + f4_curr[0][pos[0]][pos[1]-shift[1]][0].v); } } } @@ -100,32 +116,47 @@ void Engine_sse::UpdateCurrents() unsigned int pos[5]; f4vector temp; + unsigned int maxZ = numVectors; + for (pos[0]=0;pos[0]f4_ii[0][pos[0]][pos[1]][pos[2]].v; - f4_curr[0][pos[0]][pos[1]][pos[2]].v += Op->f4_iv[0][pos[0]][pos[1]][pos[2]].v * ( f4_volt[2][pos[0]][pos[1]][pos[2]].v - f4_volt[2][pos[0]][pos[1]+1][pos[2]].v - f4_volt[1][pos[0]][pos[1]][pos[2]].v + temp.v); + f4_curr[0][pos[0]][pos[1]][pos[2]].v += Op->f4_iv[0][pos[0]][pos[1]][pos[2]].v * ( f4_volt[2][pos[0]][pos[1]][pos[2]].v - f4_volt[2][pos[0]][pos[1]+1][pos[2]].v - f4_volt[1][pos[0]][pos[1]][pos[2]].v + f4_volt[1][pos[0]][pos[1]][pos[2]+1].v); // y-pol - temp.f[0] = f4_volt[0][pos[0]][pos[1]][pos[2]].f[1]; - temp.f[1] = f4_volt[0][pos[0]][pos[1]][pos[2]].f[2]; - temp.f[2] = f4_volt[0][pos[0]][pos[1]][pos[2]].f[3]; - temp.f[3] = f4_volt[0][pos[0]][pos[1]][pos[2]+1].f[0]; // FIXME outside sim area f4_curr[1][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii[1][pos[0]][pos[1]][pos[2]].v; - f4_curr[1][pos[0]][pos[1]][pos[2]].v += Op->f4_iv[1][pos[0]][pos[1]][pos[2]].v * ( f4_volt[0][pos[0]][pos[1]][pos[2]].v - temp.v - f4_volt[2][pos[0]][pos[1]][pos[2]].v + f4_volt[2][pos[0]+1][pos[1]][pos[2]].v); + f4_curr[1][pos[0]][pos[1]][pos[2]].v += Op->f4_iv[1][pos[0]][pos[1]][pos[2]].v * ( f4_volt[0][pos[0]][pos[1]][pos[2]].v - f4_volt[0][pos[0]][pos[1]][pos[2]+1].v - f4_volt[2][pos[0]][pos[1]][pos[2]].v + f4_volt[2][pos[0]+1][pos[1]][pos[2]].v); // z-pol f4_curr[2][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii[2][pos[0]][pos[1]][pos[2]].v; f4_curr[2][pos[0]][pos[1]][pos[2]].v += Op->f4_iv[2][pos[0]][pos[1]][pos[2]].v * ( f4_volt[1][pos[0]][pos[1]][pos[2]].v - f4_volt[1][pos[0]+1][pos[1]][pos[2]].v - f4_volt[0][pos[0]][pos[1]][pos[2]].v + f4_volt[0][pos[0]][pos[1]+1][pos[2]].v); } + + // for pos[2] = maxZ-1 + // x-pol + temp.f[0] = f4_volt[1][pos[0]][pos[1]][0].f[1]; + temp.f[1] = f4_volt[1][pos[0]][pos[1]][0].f[2]; + temp.f[2] = f4_volt[1][pos[0]][pos[1]][0].f[3]; + temp.f[3] = 0; + f4_curr[0][pos[0]][pos[1]][maxZ-1].v *= Op->f4_ii[0][pos[0]][pos[1]][maxZ-1].v; + f4_curr[0][pos[0]][pos[1]][maxZ-1].v += Op->f4_iv[0][pos[0]][pos[1]][maxZ-1].v * ( f4_volt[2][pos[0]][pos[1]][maxZ-1].v - f4_volt[2][pos[0]][pos[1]+1][maxZ-1].v - f4_volt[1][pos[0]][pos[1]][maxZ-1].v + temp.v); + + // y-pol + temp.f[0] = f4_volt[0][pos[0]][pos[1]][0].f[1]; + temp.f[1] = f4_volt[0][pos[0]][pos[1]][0].f[2]; + temp.f[2] = f4_volt[0][pos[0]][pos[1]][0].f[3]; + temp.f[3] = 0; + f4_curr[1][pos[0]][pos[1]][maxZ-1].v *= Op->f4_ii[1][pos[0]][pos[1]][maxZ-1].v; + f4_curr[1][pos[0]][pos[1]][maxZ-1].v += Op->f4_iv[1][pos[0]][pos[1]][maxZ-1].v * ( f4_volt[0][pos[0]][pos[1]][maxZ-1].v - temp.v - f4_volt[2][pos[0]][pos[1]][maxZ-1].v + f4_volt[2][pos[0]+1][pos[1]][maxZ-1].v); + + // z-pol + f4_curr[2][pos[0]][pos[1]][maxZ-1].v *= Op->f4_ii[2][pos[0]][pos[1]][maxZ-1].v; + f4_curr[2][pos[0]][pos[1]][maxZ-1].v += Op->f4_iv[2][pos[0]][pos[1]][maxZ-1].v * ( f4_volt[1][pos[0]][pos[1]][maxZ-1].v - f4_volt[1][pos[0]+1][pos[1]][maxZ-1].v - f4_volt[0][pos[0]][pos[1]][maxZ-1].v + f4_volt[0][pos[0]][pos[1]+1][maxZ-1].v); } } } diff --git a/FDTD/engine_sse.h b/FDTD/engine_sse.h index c9cbdaf..ed92a7f 100644 --- a/FDTD/engine_sse.h +++ b/FDTD/engine_sse.h @@ -32,8 +32,8 @@ public: virtual unsigned int GetNumberOfTimesteps() {return numTS;}; - inline virtual FDTD_FLOAT& GetVolt( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_volt[n][x][y][z/4].f[z%4]; } - inline virtual FDTD_FLOAT& GetCurr( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_curr[n][x][y][z/4].f[z%4]; } + inline virtual FDTD_FLOAT& GetVolt( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_volt[n][x][y][z%numVectors].f[z/numVectors]; } + inline virtual FDTD_FLOAT& GetCurr( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_curr[n][x][y][z%numVectors].f[z/numVectors]; } protected: Engine_sse(const Operator_sse* op); @@ -42,6 +42,8 @@ protected: virtual void UpdateVoltages(); virtual void UpdateCurrents(); + unsigned int numVectors; + f4vector**** f4_volt; f4vector**** f4_curr; }; diff --git a/FDTD/operator_sse.cpp b/FDTD/operator_sse.cpp index 23f7d73..1d097a1 100644 --- a/FDTD/operator_sse.cpp +++ b/FDTD/operator_sse.cpp @@ -63,5 +63,7 @@ void Operator_sse::InitOperator() f4_vi = Create_N_3DArray_v4sf(numLines); f4_iv = Create_N_3DArray_v4sf(numLines); f4_ii = Create_N_3DArray_v4sf(numLines); + + numVectors = ceil((double)numLines[2]/4.0); } diff --git a/FDTD/operator_sse.h b/FDTD/operator_sse.h index 522e4b8..7d04ad8 100644 --- a/FDTD/operator_sse.h +++ b/FDTD/operator_sse.h @@ -28,11 +28,11 @@ public: static Operator_sse* New(); virtual ~Operator_sse(); - inline virtual FDTD_FLOAT& GetVV( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_vv[n][x][y][z/4].f[z%4]; } - inline virtual FDTD_FLOAT& GetVI( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_vi[n][x][y][z/4].f[z%4]; } + inline virtual FDTD_FLOAT& GetVV( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_vv[n][x][y][z%numVectors].f[z/numVectors]; } + inline virtual FDTD_FLOAT& GetVI( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_vi[n][x][y][z%numVectors].f[z/numVectors]; } - inline virtual FDTD_FLOAT& GetII( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_ii[n][x][y][z/4].f[z%4]; } - inline virtual FDTD_FLOAT& GetIV( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_iv[n][x][y][z/4].f[z%4]; } + inline virtual FDTD_FLOAT& GetII( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_ii[n][x][y][z%numVectors].f[z/numVectors]; } + inline virtual FDTD_FLOAT& GetIV( unsigned int n, unsigned int x, unsigned int y, unsigned int z ) const { return f4_iv[n][x][y][z%numVectors].f[z/numVectors]; } protected: //! use New() for creating a new Operator @@ -42,6 +42,8 @@ protected: virtual void Reset(); virtual void InitOperator(); + unsigned int numVectors; + // engine/post-proc needs access public: f4vector**** f4_vv; //calc new voltage from old voltage diff --git a/tools/array_ops.cpp b/tools/array_ops.cpp index b326e01..01c9c48 100644 --- a/tools/array_ops.cpp +++ b/tools/array_ops.cpp @@ -149,7 +149,7 @@ void Delete_N_3DArray_v4sf(f4vector**** array, const unsigned int* numLines) f4vector*** Create3DArray_v4sf(const unsigned int* numLines) { - unsigned int numZ = ceil(numLines[2]/4)+1; + unsigned int numZ = ceil((double)numLines[2]/4.0); f4vector*** array=NULL; unsigned int pos[3];