Research Article
GPU Preconditioning for Block Linear Systems Using Block Incomplete Sparse Approximate Inverses
Algorithm 2
GPU kernel for solving LGSBTS.
| (1) | __shared__ double sm[warps_per_block][warp_size]; |
| (2) | __shared__ double inv[warps_per_block][bsz][bsz]; |
| (3) | __shared__ double ssol[warps_per_block][bsz][bsz]; |
| (4) | tid = threadIdx.x + blockIdx.x * blockDim.x; |
| (5) | wpid = threadIdx.x/warp_size; |
| (6) | gwpid = tid/warp_size; |
| (7) | rpos = threadIdx.x%warp_size; |
| (8) | range = ; |
| (9) | if gwpid < nbr then |
| (10) | if rpos < range then |
| (11) | r = rpos%bsz; |
| (12) | c = (rpos/bsz)%bsz; |
| (13) | InvStIdx = NLColPtr[gwpid]; |
| (14) | InvEdIdx = NLColPtr[gwpid + 1]; |
| (15) | for idx = InvStIdx to InvEdIdx − 1 do |
| (16) | irow = NLRowVal[idx]; |
| (17) | if rpos < then |
| (18) | tmpidx = LRowPtr[irow + 1] − 1; |
| (19) | ssol[wpid][r][c] = L[tmpidx + rpos]; |
| (20) | inv[wpid][r][c] = 1.0/det ; |
| (21) | sm[wpid][rpos] = Rhs[idx + rpos]; |
| (22) | ssol[wpid][r][c] = ; |
| (23) | NL[idx + rpos] = ssol[wpid][r][c]; |
| (24) | end if |
| (25) | myidx = idx + 1 + ; |
| (26) | triEdIdx = triStIdx + (n − icol) ; |
| (27) | solIdx = solStIdx + + rpos; |
| (28) | while myidx < InvEdIdx do |
| (29) | sm[wpid][rpos] = 0.0; |
| (30) | myrow = NLRowVal[myidx]; |
| (31) | tmpidx = LRowPtr[myrow + 1] − 1; |
| (32) | while tmpidx >= LRowPtr[myrow] && LColVal[tmpidx] == irow do |
| (33) | tmpidx--; |
| (34) | end while |
| (35) | if tmpidx >= LRowPtr[myrow] && LColVal[tmpidx] == irow then |
| (36) | s[wpid][lane] = L[tmpidx + rpos%]; |
| (37) | end if |
| (38) | Rhs[myidx + lane%] -= ; |
| (39) | myidx += warp_size/; |
| (40) | end while |
| (41) | end for |
| (42) | end if |
| (43) | end if |
|