Research Article
Multi-GPU Support on Single Node Using Directive-Based Programming Model
Algorithm 3
A multi-GPU implementation of matrix multiplication (MM) using the hybrid model.
| omp_set_num_threads(threads); | | #pragma omp parallel | | { | | int i, j, k; | | int id, blocks, start, end; | | id = omp_get_thread_num(); | | blocks = n/threads; | | start = id*blocks; | | end = (id+1)*blocks; | | acc_set_device_num(id+1, acc_device_not_host); | | #pragma acc data copyin(A[start*n:blocks*n]) | | copyin(B[0:n*n]) | | copyout(C[start*n:blocks*n]) | | { | | #pragma acc parallel num_gangs(32) vector_length(32) | | { | | #pragma acc loop gang | | for(i=start; i<end; i++){ | | #pragma acc loop vector | | for(j=0; j<n; j++){ | | float c = 0.0f; | | for(k=0; k<n; k++) | | c += A[i*n+k] * B[k*n+j]; | | C[i*n+j] = c; | | } | | } | | } | |
|