Application Article
A High-Performance Parallel FDTD Method Enhanced by Using SSE Instruction Set
for(i = 0; i <= nx; i ++)
{
    vHi_Coeff = _mm_load1_ps(&Hi_Coeff);    //load single float value to vector
    for(j = 0; j <= ny; j++)
    {
        vHj_Coeff = _mm_load1_ps(&Hj_Coeff);    //load single float value to vector
        vEk = (__m128 *) Ek[i][j];
        vHx = (__m128 *) Hx[i][j];
        vHy = (__m128 *) Hy[i][j];
        vHy_minus = (__m128 *) Hy[i-1][j];
        vHx_minus = (__m128 *) Hx[i][j-1];
        for(k = 0, vk = 0; k < nz; k += 4, vk ++)
        {
            vEk_Coeff = _mm_load1_ps(&Ek_Coeff);
            xmm0 = _mm_sub_ps(vHx[vk], vHx_minus[vk]);
            xmm0 = _mm_mul_ps(vHj_Coeff, xmm0);
            xmm1 = _mm_sub_ps(vHy[vk], vHy_minus[vk]);
            xmm1 = _mm_mul_ps(vHi_Coeff, xmm1);
            xmm0 = _mm_sub_ps(xmm1, xmm0);
            xmm1 = _mm_mul_ps(vEk[vk], vEk_Coeff);
            vEk[vk] = _mm_add_ps(xmm1, xmm0);
|