| () ⋯ |
| () cudaArray **cuda_arrays = (cudaArray**) malloc (num_of_stts * sizeof(cudaArray*)); |
| () cudaTextureObject_t* textObj = (cudaTextureObject_t*) malloc (num_of_stts * sizeof |
| (cudaTextureObject_t)); |
| () for (int i = 0; i < num_of_stts; i++) { |
| () cuda_arrays[i] = generate_cuda_array (get_dfa_matrix(i), get_dfa_width(i), |
| get_dfa_height(i)); |
| () textObj[i] = generate_texture_objects (cuda_arrays[i]); |
| () } |
| () |
| () ⋯ |
| () cudaStream_t *streams = |
| () (cudaStream_t*) malloc (nstreams * sizeof(cudaStream_t)); |
| () // create multiple CUDA streams |
| () for (int i = 0; i < nstreams; i++) |
| () cudaStreamCreate (&(streams[i])); |
| () |
| () //copy data to GPU memory, each stream copies one segment |
| () for (int i = 0; i < nstreams; i++) { |
| () long in_offset = i * input_len/nstreams; |
| () cudaMemcpyAsync (d_input + in_offset, h_input + in_offset, input_len * sizeof |
| (char)/nstreams, cudaMemcpyHostToDevice, streams[i]); |
| () } |
| () |
| () // each stream processes input data with each dfa (textObj[i]) |
| () for (int i = 0; i < nstreams; i++) |
| () matching <<<blocks, threads, sm_size, streams[i]>>>(textObj[i], d_input, |
| input_len, pattern_max_len, d_output, output_len); |
| () |
| () // copy results back to host CPU, each stream copies one segment |
| () for (int i = 0; i < nstreams; i++) { |
| () long out_offset = i * output_len/nstreams; |
| () cudaMemcpyAsync (h_output + out_offset, d_output + out_offset, output_len * sizeof |
| (int)/nstreams, cudaMemcpyDeviceToHost, streams[i]); |
| () } |
| () |
| () for (int i = 0; i < nstreams; i++) |
| () cudaStreamDestroy (streams[i]); |
| () |
| () ⋯ |