openEMS/FDTD/engine_sse_compressed.cpp

/*
*	Copyright (C) 2010 Thorsten Liebig (Thorsten.Liebig@gmx.de)
*
*	This program is free software: you can redistribute it and/or modify
*	it under the terms of the GNU General Public License as published by
*	the Free Software Foundation, either version 3 of the License, or
*	(at your option) any later version.
*
*	This program is distributed in the hope that it will be useful,
*	but WITHOUT ANY WARRANTY; without even the implied warranty of
*	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*	GNU General Public License for more details.
*
*	You should have received a copy of the GNU General Public License
*	along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "engine_sse_compressed.h"


// Factory function: prints a short notice to cout, allocates a new
// Engine_SSE_Compressed for the given operator, calls Init() on it and
// returns the pointer obtained from new.
Engine_SSE_Compressed* Engine_SSE_Compressed::New(const Operator_SSE_Compressed* op)
{
	cout << "Create FDTD engine (compressed SSE)" << endl;

	Engine_SSE_Compressed* engine = new Engine_SSE_Compressed(op);
	engine->Init();

	return engine;
}

// Constructor: forwards the operator to the Engine_sse base class and
// additionally stores the pointer in Op.
Engine_SSE_Compressed::Engine_SSE_Compressed(const Operator_SSE_Compressed* op)
	: Engine_sse(op)
{
	this->Op = op;
}

// Destructor: the body is empty, nothing is released explicitly here.
Engine_SSE_Compressed::~Engine_SSE_Compressed()
{
}

void Engine_SSE_Compressed::UpdateVoltages(unsigned int startX, unsigned int numX)
{
	unsigned int pos[3];
	bool shift[2];
	f4vector temp;

	pos[0] = startX;
	unsigned int index=0;
	for (unsigned int posX=0;posX<numX;++posX)
	{
		shift[0]=pos[0];
		for (pos[1]=0;pos[1]<numLines[1];++pos[1])
		{
			shift[1]=pos[1];
			for (pos[2]=1;pos[2]<numVectors;++pos[2])
			{
				index = Op->m_Op_index[pos[0]][pos[1]][pos[2]];
				// x-polarization
				f4_volt[0][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv_Compressed[0][index].v;
				f4_volt[0][pos[0]][pos[1]][pos[2]].v += Op->f4_vi_Compressed[0][index].v * ( f4_curr[2][pos[0]][pos[1]][pos[2]].v - f4_curr[2][pos[0]][pos[1]-shift[1]][pos[2]].v - f4_curr[1][pos[0]][pos[1]][pos[2]].v + f4_curr[1][pos[0]][pos[1]][pos[2]-1].v );

				// y-polarization
				f4_volt[1][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv_Compressed[1][index].v;
				f4_volt[1][pos[0]][pos[1]][pos[2]].v += Op->f4_vi_Compressed[1][index].v * ( f4_curr[0][pos[0]][pos[1]][pos[2]].v - f4_curr[0][pos[0]][pos[1]][pos[2]-1].v - f4_curr[2][pos[0]][pos[1]][pos[2]].v + f4_curr[2][pos[0]-shift[0]][pos[1]][pos[2]].v);

				// z-polarization
				f4_volt[2][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv_Compressed[2][index].v;
				f4_volt[2][pos[0]][pos[1]][pos[2]].v += Op->f4_vi_Compressed[2][index].v * ( f4_curr[1][pos[0]][pos[1]][pos[2]].v - f4_curr[1][pos[0]-shift[0]][pos[1]][pos[2]].v - f4_curr[0][pos[0]][pos[1]][pos[2]].v + f4_curr[0][pos[0]][pos[1]-shift[1]][pos[2]].v);
			}

			// for pos[2] = 0
			// x-polarization
			index = Op->m_Op_index[pos[0]][pos[1]][0];
			temp.f[0] = 0;
			temp.f[1] = f4_curr[1][pos[0]][pos[1]][numVectors-1].f[0];
			temp.f[2] = f4_curr[1][pos[0]][pos[1]][numVectors-1].f[1];
			temp.f[3] = f4_curr[1][pos[0]][pos[1]][numVectors-1].f[2];
			f4_volt[0][pos[0]][pos[1]][0].v *= Op->f4_vv_Compressed[0][index].v;
			f4_volt[0][pos[0]][pos[1]][0].v += Op->f4_vi_Compressed[0][index].v * ( f4_curr[2][pos[0]][pos[1]][0].v - f4_curr[2][pos[0]][pos[1]-shift[1]][0].v - f4_curr[1][pos[0]][pos[1]][0].v + temp.v );

			// y-polarization
			temp.f[0] = 0;
			temp.f[1] = f4_curr[0][pos[0]][pos[1]][numVectors-1].f[0];
			temp.f[2] = f4_curr[0][pos[0]][pos[1]][numVectors-1].f[1];
			temp.f[3] = f4_curr[0][pos[0]][pos[1]][numVectors-1].f[2];
			f4_volt[1][pos[0]][pos[1]][0].v *= Op->f4_vv_Compressed[1][index].v;
			f4_volt[1][pos[0]][pos[1]][0].v += Op->f4_vi_Compressed[1][index].v * ( f4_curr[0][pos[0]][pos[1]][0].v - temp.v - f4_curr[2][pos[0]][pos[1]][0].v + f4_curr[2][pos[0]-shift[0]][pos[1]][0].v);

			// z-polarization
			f4_volt[2][pos[0]][pos[1]][0].v *= Op->f4_vv_Compressed[2][index].v;
			f4_volt[2][pos[0]][pos[1]][0].v += Op->f4_vi_Compressed[2][index].v * ( f4_curr[1][pos[0]][pos[1]][0].v - f4_curr[1][pos[0]-shift[0]][pos[1]][0].v - f4_curr[0][pos[0]][pos[1]][0].v + f4_curr[0][pos[0]][pos[1]-shift[1]][0].v);
		}
		++pos[0];
	}
}

void Engine_SSE_Compressed::UpdateCurrents(unsigned int startX, unsigned int numX)
{
	unsigned int pos[3];
	f4vector temp;

	pos[0] = startX;
	unsigned int index;
	for (unsigned int posX=0;posX<numX;++posX)
	{
		for (pos[1]=0;pos[1]<numLines[1]-1;++pos[1])
		{
			for (pos[2]=0;pos[2]<numVectors-1;++pos[2])
			{
				index = Op->m_Op_index[pos[0]][pos[1]][pos[2]];
				// x-pol
				f4_curr[0][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii_Compressed[0][index].v;
				f4_curr[0][pos[0]][pos[1]][pos[2]].v += Op->f4_iv_Compressed[0][index].v * ( f4_volt[2][pos[0]][pos[1]][pos[2]].v - f4_volt[2][pos[0]][pos[1]+1][pos[2]].v - f4_volt[1][pos[0]][pos[1]][pos[2]].v + f4_volt[1][pos[0]][pos[1]][pos[2]+1].v);

				// y-pol
				f4_curr[1][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii_Compressed[1][index].v;
				f4_curr[1][pos[0]][pos[1]][pos[2]].v += Op->f4_iv_Compressed[1][index].v * ( f4_volt[0][pos[0]][pos[1]][pos[2]].v - f4_volt[0][pos[0]][pos[1]][pos[2]+1].v - f4_volt[2][pos[0]][pos[1]][pos[2]].v + f4_volt[2][pos[0]+1][pos[1]][pos[2]].v);

				// z-pol
				f4_curr[2][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii_Compressed[2][index].v;
				f4_curr[2][pos[0]][pos[1]][pos[2]].v += Op->f4_iv_Compressed[2][index].v * ( f4_volt[1][pos[0]][pos[1]][pos[2]].v - f4_volt[1][pos[0]+1][pos[1]][pos[2]].v - f4_volt[0][pos[0]][pos[1]][pos[2]].v + f4_volt[0][pos[0]][pos[1]+1][pos[2]].v);
			}

			index = Op->m_Op_index[pos[0]][pos[1]][numVectors-1];
			// for pos[2] = numVectors-1
			// x-pol
			temp.f[0] = f4_volt[1][pos[0]][pos[1]][0].f[1];
			temp.f[1] = f4_volt[1][pos[0]][pos[1]][0].f[2];
			temp.f[2] = f4_volt[1][pos[0]][pos[1]][0].f[3];
			temp.f[3] = 0;
			f4_curr[0][pos[0]][pos[1]][numVectors-1].v *= Op->f4_ii_Compressed[0][index].v;
			f4_curr[0][pos[0]][pos[1]][numVectors-1].v += Op->f4_iv_Compressed[0][index].v * ( f4_volt[2][pos[0]][pos[1]][numVectors-1].v - f4_volt[2][pos[0]][pos[1]+1][numVectors-1].v - f4_volt[1][pos[0]][pos[1]][numVectors-1].v + temp.v);

			// y-pol
			temp.f[0] = f4_volt[0][pos[0]][pos[1]][0].f[1];
			temp.f[1] = f4_volt[0][pos[0]][pos[1]][0].f[2];
			temp.f[2] = f4_volt[0][pos[0]][pos[1]][0].f[3];
			temp.f[3] = 0;
			f4_curr[1][pos[0]][pos[1]][numVectors-1].v *= Op->f4_ii_Compressed[1][index].v;
			f4_curr[1][pos[0]][pos[1]][numVectors-1].v += Op->f4_iv_Compressed[1][index].v * ( f4_volt[0][pos[0]][pos[1]][numVectors-1].v - temp.v - f4_volt[2][pos[0]][pos[1]][numVectors-1].v + f4_volt[2][pos[0]+1][pos[1]][numVectors-1].v);

			// z-pol
			f4_curr[2][pos[0]][pos[1]][numVectors-1].v *= Op->f4_ii_Compressed[2][index].v;
			f4_curr[2][pos[0]][pos[1]][numVectors-1].v += Op->f4_iv_Compressed[2][index].v * ( f4_volt[1][pos[0]][pos[1]][numVectors-1].v - f4_volt[1][pos[0]+1][pos[1]][numVectors-1].v - f4_volt[0][pos[0]][pos[1]][numVectors-1].v + f4_volt[0][pos[0]][pos[1]+1][numVectors-1].v);
		}
		++pos[0];
	}
}
introducing compressed sse operator & engine use --engine=sse-compressed option to enable 2010-05-19 09:41:35 +00:00			`/*`
			`* Copyright (C) 2010 Thorsten Liebig (Thorsten.Liebig@gmx.de)`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, either version 3 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#include "engine_sse_compressed.h"`


			`Engine_SSE_Compressed* Engine_SSE_Compressed::New(const Operator_SSE_Compressed* op)`
			`{`
more console output regarding operator & engine usage 2010-05-19 19:25:15 +00:00			`cout << "Create FDTD engine (compressed SSE)" << endl;`
introducing compressed sse operator & engine use --engine=sse-compressed option to enable 2010-05-19 09:41:35 +00:00			`Engine_SSE_Compressed* e = new Engine_SSE_Compressed(op);`
			`e->Init();`
			`return e;`
			`}`

			`Engine_SSE_Compressed::Engine_SSE_Compressed(const Operator_SSE_Compressed* op) : Engine_sse(op)`
			`{`
			`Op = op;`
			`}`

			`Engine_SSE_Compressed::~Engine_SSE_Compressed()`
			`{`
			`}`

			`void Engine_SSE_Compressed::UpdateVoltages(unsigned int startX, unsigned int numX)`
			`{`
			`unsigned int pos[3];`
			`bool shift[2];`
			`f4vector temp;`

			`pos[0] = startX;`
			`unsigned int index=0;`
			`for (unsigned int posX=0;posX<numX;++posX)`
			`{`
			`shift[0]=pos[0];`
			`for (pos[1]=0;pos[1]<numLines[1];++pos[1])`
			`{`
			`shift[1]=pos[1];`
			`for (pos[2]=1;pos[2]<numVectors;++pos[2])`
			`{`
			`index = Op->m_Op_index[pos[0]][pos[1]][pos[2]];`
			`// x-polarization`
			`f4_volt[0][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv_Compressed[0][index].v;`
			`f4_volt[0][pos[0]][pos[1]][pos[2]].v += Op->f4_vi_Compressed[0][index].v * ( f4_curr[2][pos[0]][pos[1]][pos[2]].v - f4_curr[2][pos[0]][pos[1]-shift[1]][pos[2]].v - f4_curr[1][pos[0]][pos[1]][pos[2]].v + f4_curr[1][pos[0]][pos[1]][pos[2]-1].v );`

			`// y-polarization`
			`f4_volt[1][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv_Compressed[1][index].v;`
			`f4_volt[1][pos[0]][pos[1]][pos[2]].v += Op->f4_vi_Compressed[1][index].v * ( f4_curr[0][pos[0]][pos[1]][pos[2]].v - f4_curr[0][pos[0]][pos[1]][pos[2]-1].v - f4_curr[2][pos[0]][pos[1]][pos[2]].v + f4_curr[2][pos[0]-shift[0]][pos[1]][pos[2]].v);`

			`// z-polarization`
			`f4_volt[2][pos[0]][pos[1]][pos[2]].v *= Op->f4_vv_Compressed[2][index].v;`
			`f4_volt[2][pos[0]][pos[1]][pos[2]].v += Op->f4_vi_Compressed[2][index].v * ( f4_curr[1][pos[0]][pos[1]][pos[2]].v - f4_curr[1][pos[0]-shift[0]][pos[1]][pos[2]].v - f4_curr[0][pos[0]][pos[1]][pos[2]].v + f4_curr[0][pos[0]][pos[1]-shift[1]][pos[2]].v);`
			`}`

			`// for pos[2] = 0`
			`// x-polarization`
			`index = Op->m_Op_index[pos[0]][pos[1]][0];`
			`temp.f[0] = 0;`
			`temp.f[1] = f4_curr[1][pos[0]][pos[1]][numVectors-1].f[0];`
			`temp.f[2] = f4_curr[1][pos[0]][pos[1]][numVectors-1].f[1];`
			`temp.f[3] = f4_curr[1][pos[0]][pos[1]][numVectors-1].f[2];`
			`f4_volt[0][pos[0]][pos[1]][0].v *= Op->f4_vv_Compressed[0][index].v;`
			`f4_volt[0][pos[0]][pos[1]][0].v += Op->f4_vi_Compressed[0][index].v * ( f4_curr[2][pos[0]][pos[1]][0].v - f4_curr[2][pos[0]][pos[1]-shift[1]][0].v - f4_curr[1][pos[0]][pos[1]][0].v + temp.v );`

			`// y-polarization`
			`temp.f[0] = 0;`
			`temp.f[1] = f4_curr[0][pos[0]][pos[1]][numVectors-1].f[0];`
			`temp.f[2] = f4_curr[0][pos[0]][pos[1]][numVectors-1].f[1];`
			`temp.f[3] = f4_curr[0][pos[0]][pos[1]][numVectors-1].f[2];`
			`f4_volt[1][pos[0]][pos[1]][0].v *= Op->f4_vv_Compressed[1][index].v;`
			`f4_volt[1][pos[0]][pos[1]][0].v += Op->f4_vi_Compressed[1][index].v * ( f4_curr[0][pos[0]][pos[1]][0].v - temp.v - f4_curr[2][pos[0]][pos[1]][0].v + f4_curr[2][pos[0]-shift[0]][pos[1]][0].v);`

			`// z-polarization`
			`f4_volt[2][pos[0]][pos[1]][0].v *= Op->f4_vv_Compressed[2][index].v;`
			`f4_volt[2][pos[0]][pos[1]][0].v += Op->f4_vi_Compressed[2][index].v * ( f4_curr[1][pos[0]][pos[1]][0].v - f4_curr[1][pos[0]-shift[0]][pos[1]][0].v - f4_curr[0][pos[0]][pos[1]][0].v + f4_curr[0][pos[0]][pos[1]-shift[1]][0].v);`
			`}`
			`++pos[0];`
			`}`
			`}`

			`void Engine_SSE_Compressed::UpdateCurrents(unsigned int startX, unsigned int numX)`
			`{`
fix in multithreading linesPerThread Signed-off-by: Thorsten Liebig <Thorsten.Liebig@gmx.de> 2010-06-29 10:44:47 +00:00			`unsigned int pos[3];`
introducing compressed sse operator & engine use --engine=sse-compressed option to enable 2010-05-19 09:41:35 +00:00			`f4vector temp;`

			`pos[0] = startX;`
			`unsigned int index;`
			`for (unsigned int posX=0;posX<numX;++posX)`
			`{`
			`for (pos[1]=0;pos[1]<numLines[1]-1;++pos[1])`
			`{`
			`for (pos[2]=0;pos[2]<numVectors-1;++pos[2])`
			`{`
			`index = Op->m_Op_index[pos[0]][pos[1]][pos[2]];`
			`// x-pol`
			`f4_curr[0][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii_Compressed[0][index].v;`
			`f4_curr[0][pos[0]][pos[1]][pos[2]].v += Op->f4_iv_Compressed[0][index].v * ( f4_volt[2][pos[0]][pos[1]][pos[2]].v - f4_volt[2][pos[0]][pos[1]+1][pos[2]].v - f4_volt[1][pos[0]][pos[1]][pos[2]].v + f4_volt[1][pos[0]][pos[1]][pos[2]+1].v);`

			`// y-pol`
			`f4_curr[1][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii_Compressed[1][index].v;`
			`f4_curr[1][pos[0]][pos[1]][pos[2]].v += Op->f4_iv_Compressed[1][index].v * ( f4_volt[0][pos[0]][pos[1]][pos[2]].v - f4_volt[0][pos[0]][pos[1]][pos[2]+1].v - f4_volt[2][pos[0]][pos[1]][pos[2]].v + f4_volt[2][pos[0]+1][pos[1]][pos[2]].v);`

			`// z-pol`
			`f4_curr[2][pos[0]][pos[1]][pos[2]].v *= Op->f4_ii_Compressed[2][index].v;`
			`f4_curr[2][pos[0]][pos[1]][pos[2]].v += Op->f4_iv_Compressed[2][index].v * ( f4_volt[1][pos[0]][pos[1]][pos[2]].v - f4_volt[1][pos[0]+1][pos[1]][pos[2]].v - f4_volt[0][pos[0]][pos[1]][pos[2]].v + f4_volt[0][pos[0]][pos[1]+1][pos[2]].v);`
			`}`

			`index = Op->m_Op_index[pos[0]][pos[1]][numVectors-1];`
			`// for pos[2] = numVectors-1`
			`// x-pol`
			`temp.f[0] = f4_volt[1][pos[0]][pos[1]][0].f[1];`
			`temp.f[1] = f4_volt[1][pos[0]][pos[1]][0].f[2];`
			`temp.f[2] = f4_volt[1][pos[0]][pos[1]][0].f[3];`
			`temp.f[3] = 0;`
			`f4_curr[0][pos[0]][pos[1]][numVectors-1].v *= Op->f4_ii_Compressed[0][index].v;`
			`f4_curr[0][pos[0]][pos[1]][numVectors-1].v += Op->f4_iv_Compressed[0][index].v * ( f4_volt[2][pos[0]][pos[1]][numVectors-1].v - f4_volt[2][pos[0]][pos[1]+1][numVectors-1].v - f4_volt[1][pos[0]][pos[1]][numVectors-1].v + temp.v);`

			`// y-pol`
			`temp.f[0] = f4_volt[0][pos[0]][pos[1]][0].f[1];`
			`temp.f[1] = f4_volt[0][pos[0]][pos[1]][0].f[2];`
			`temp.f[2] = f4_volt[0][pos[0]][pos[1]][0].f[3];`
			`temp.f[3] = 0;`
			`f4_curr[1][pos[0]][pos[1]][numVectors-1].v *= Op->f4_ii_Compressed[1][index].v;`
			`f4_curr[1][pos[0]][pos[1]][numVectors-1].v += Op->f4_iv_Compressed[1][index].v * ( f4_volt[0][pos[0]][pos[1]][numVectors-1].v - temp.v - f4_volt[2][pos[0]][pos[1]][numVectors-1].v + f4_volt[2][pos[0]+1][pos[1]][numVectors-1].v);`

			`// z-pol`
			`f4_curr[2][pos[0]][pos[1]][numVectors-1].v *= Op->f4_ii_Compressed[2][index].v;`
			`f4_curr[2][pos[0]][pos[1]][numVectors-1].v += Op->f4_iv_Compressed[2][index].v * ( f4_volt[1][pos[0]][pos[1]][numVectors-1].v - f4_volt[1][pos[0]+1][pos[1]][numVectors-1].v - f4_volt[0][pos[0]][pos[1]][numVectors-1].v + f4_volt[0][pos[0]][pos[1]+1][numVectors-1].v);`
			`}`
			`++pos[0];`
			`}`
			`}`