📜  Hillis Steele 扫描(并行前缀扫描算法)

📅  最后修改于: 2022-05-13 01:55:00.308000             🧑  作者: Mango

Hillis Steele 扫描(并行前缀扫描算法)

本文讨论一种称为 Hillis-Steele 扫描的扫描算法,也称为并行前缀扫描算法。此处的扫描操作本质上是指计算数组的前缀和。Hillis-Steele 扫描是一种以并行方式执行扫描操作的算法。对于大小为 N 的数组 x[],算法步骤如下:

  • 令变量 d 在范围 $[1, \log_2 N]$ 内迭代;对所有 k 并行地检查是否满足 $k \ge 2^{d-1}$。若满足,则将 $x[k - 2^{d-1}]$ 的值加到 $x[k]$ 上,即 $x[k] \leftarrow x[k - 2^{d-1}] + x[k]$。

视觉表现:

当深度 d 达到 $\log_2 N$ 时计算终止,此时数组中保存的即为前缀和。同一层内的所有加法操作并行执行,而各层(d = 1, d = 2, …)按顺序依次进行。

下面是该算法在 CUDA C++ 中的实现:

C++
// C++ program for the above approach
 
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
using namespace std::chrono;
using namespace std;
 
// Reports a failed CUDA runtime call and terminates the program.
//
//   err  - status code returned by a CUDA runtime API call
//   file - source file name to include in the message
//   line - source line number to include in the message
//
// Returns normally only when err == cudaSuccess; otherwise prints the
// CUDA error string together with file/line and exits with EXIT_FAILURE.
static void HandleError(cudaError_t err,
                        const char* file,
                        int line)
{
    // Nothing to report when the call succeeded.
    if (err == cudaSuccess)
        return;

    const char* message = cudaGetErrorString(err);
    printf("\n%s in %s at line %d\n", message, file, line);

    exit(EXIT_FAILURE);
}
 
// Wraps a CUDA runtime call: forwards its status code to HandleError along
// with the call site's file and line. The definition must stay on one
// logical line (or use '\' continuations); otherwise the macro body is
// truncated at the line break.
#define HANDLE_ERROR(err) (HandleError((err), __FILE__, __LINE__))
 
/**
 * One pass of the Hillis-Steele inclusive scan.
 *
 * With `in` denoting `src` when it is non-null and `arr` otherwise, every
 * thread whose flat index `tid` passes the bounds check performs:
 *
 *     arr[tid] = in[tid] + in[tid - space]    if tid >= space
 *     arr[tid] = in[tid]                      otherwise (only when in != arr)
 *
 * Launch layout: the flat index is computed as
 *     tid = x + y * gridDim.x * blockDim.x
 * with x/y the global thread coordinates, so a 1D block (blockDim.y == 1)
 * on a 1D or 2D grid enumerates consecutive element indices.
 *
 * @param arr    Destination buffer (also the source when `src` is null).
 * @param space  Distance to the partner element for this pass (1, 2, 4, ...).
 * @param step   Index of the current pass; unused, kept for compatibility.
 * @param steps  Total number of passes; unused, kept for compatibility.
 * @param src    Optional read-only source buffer. When null (or equal to
 *               `arr`) the update happens in place, which is a data race
 *               between threads: one thread may read arr[tid - space] while
 *               another is updating it in the same pass. Pass a distinct
 *               buffer to get a race-free pass.
 * @param n      Number of valid elements. Threads with tid >= n do nothing.
 *               A negative value disables the check (legacy behaviour); the
 *               launch must then cover exactly the array length.
 */
template <typename T>
__global__ void
Hillis_Steele_Scan_Kernel(T* arr,
                          __int64 space,
                          __int64 step,
                          __int64 steps,
                          const T* src = nullptr,
                          __int64 n = -1)
{
    (void)step;
    (void)steps;

    const __int64 x = static_cast<__int64>(threadIdx.x)
                      + static_cast<__int64>(blockDim.x) * blockIdx.x;

    const __int64 y = static_cast<__int64>(threadIdx.y)
                      + static_cast<__int64>(blockDim.y) * blockIdx.y;

    // Flatten the (possibly 2D) launch into a single element index.
    const __int64 tid = x + y * (static_cast<__int64>(gridDim.x)
                                 * blockDim.x);

    // The grid is rounded up to whole blocks, so surplus threads must not
    // touch memory past the end of the array.
    if (n >= 0 && tid >= n)
        return;

    const T* in = (src != nullptr) ? src : arr;

    if (tid >= space)
        arr[tid] = in[tid] + in[tid - space];
    else if (in != arr)
        arr[tid] = in[tid];  // no partner yet: carry the value forward
}

/**
 * Computes the inclusive prefix sum of `input` with the Hillis-Steele scan.
 *
 * The scan runs ceil(log2(N)) passes. Each pass reads one buffer and writes
 * the other (ping-pong), so no thread ever reads a value that another thread
 * is updating in the same pass. Passes are launched back-to-back on the
 * default stream, which executes them in order.
 *
 * @param input  N elements readable by cudaMemcpy with cudaMemcpyDefault
 *               (e.g. managed or device memory). Not modified.
 * @param N      Number of elements; any positive value (no power-of-two
 *               requirement).
 * @return       Buffer of N elements allocated with cudaMallocManaged that
 *               holds the prefix sums; the caller releases it with cudaFree.
 *               Returns nullptr when `input` is null or N <= 0.
 *
 * Any CUDA failure is reported through HANDLE_ERROR, which terminates the
 * program.
 */
template <typename T>
T* Hillis_Steele_Scan(T* input, __int64 N)
{
    if (input == nullptr || N <= 0)
        return nullptr;

    const size_t bytes = sizeof(T) * static_cast<size_t>(N);

    // Two buffers: `current` holds the latest partial sums, `next` receives
    // the result of the pass being launched.
    T* current = nullptr;
    T* next = nullptr;
    HANDLE_ERROR(cudaMallocManaged(&current, bytes));
    HANDLE_ERROR(cudaMallocManaged(&next, bytes));

    // Seed the scan with the input values.
    HANDLE_ERROR(cudaMemcpy(current, input, bytes, cudaMemcpyDefault));

    // Launch geometry: 1024-thread blocks, ceil(N / 1024) blocks in total,
    // folded into rows of at most 64 blocks to match the kernel's 2D index.
    const __int64 threadsPerBlock = 1024;
    const __int64 maxGridX = 64;
    const __int64 totalBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    const __int64 gridX = (totalBlocks < maxGridX) ? totalBlocks : maxGridX;
    const __int64 gridY = (totalBlocks + gridX - 1) / gridX;

    const dim3 THREADS(static_cast<unsigned int>(threadsPerBlock), 1, 1);
    const dim3 BLOCKS(static_cast<unsigned int>(gridX),
                      static_cast<unsigned int>(gridY), 1);

    // Number of passes = ceil(log2(N)), counted with integers so that large
    // or non-power-of-two N are handled exactly.
    __int64 steps = 0;
    for (__int64 s = 1; s < N; s *= 2)
        ++steps;

    __int64 step = 0;
    for (__int64 space = 1; space < N; space *= 2, ++step) {
        // next[i] = current[i] + current[i - space]
        Hillis_Steele_Scan_Kernel<T><<<BLOCKS, THREADS>>>(
            next, space, step, steps, current, N);

        // Launch-configuration errors are only visible via cudaGetLastError.
        HANDLE_ERROR(cudaGetLastError());

        T* tmp = current;
        current = next;
        next = tmp;
    }

    // Wait for the last pass and surface any asynchronous execution error.
    HANDLE_ERROR(cudaDeviceSynchronize());

    // `current` now holds the result; the other buffer is scratch.
    HANDLE_ERROR(cudaFree(next));

    return current;
}
 
// Driver Code
//
// Fills a managed array with ones, runs Hillis_Steele_Scan on it, prints the
// elapsed wall-clock time and the first `arraysize` prefix sums. With all-ones
// input the expected output is 1, 2, 3, ..., arraysize.
int main()
{
    // Number of meaningful input elements.
    const __int64 arraysize = 10;

    // Pad the working size to the smallest power of two >= arraysize.
    // Computed with integer shifts to avoid floating-point rounding.
    __int64 exponent = 0;
    while ((__int64(1) << exponent) < arraysize)
        ++exponent;
    const __int64 N = __int64(1) << exponent;

    std::cout << "\n\nELEMS --> 2^" << exponent
              << " >= " << arraysize;

    // Allocate managed memory, accessible from both host and device.
    __int64* inputArr = nullptr;
    HANDLE_ERROR(cudaMallocManaged(&inputArr,
                                   (sizeof(__int64) * N)));

    HANDLE_ERROR(cudaDeviceSynchronize());

    // INIT Test Data: an array of ones, so the prefix sums are 1, 2, 3, ...
    for (__int64 i = 0; i < N; i++) {
        inputArr[i] = 1;
    }

    // Time the scan, including its allocations and final synchronization.
    const std::chrono::high_resolution_clock::time_point tg1
        = std::chrono::high_resolution_clock::now();

    __int64* out = Hillis_Steele_Scan(
        inputArr, N);

    const std::chrono::high_resolution_clock::time_point tg2
        = std::chrono::high_resolution_clock::now();

    const std::chrono::duration<double> time_span
        = std::chrono::duration_cast<std::chrono::duration<double> >(
            tg2 - tg1);

    // duration<double> counts seconds; scale to milliseconds for display.
    std::cout << "\nTime Taken : "
              << time_span.count() * 1000
              << " ms";

    std::cout << std::endl;
    for (__int64 i = 0; i < arraysize; i++)
        std::cout << '\t' << out[i];
    std::cout << std::endl;

    // Release the result buffer and the input buffer.
    HANDLE_ERROR(cudaFree(out));
    HANDLE_ERROR(cudaFree(inputArr));

    return 0;
}


复杂度分析: 使用 O(N) 个处理器时,时间复杂度为 O(log N);总工作量为 O(N log N)。

想要从精选的视频和练习题中学习,请查看C++ 基础课程,从基础到高级 C++ 和C++ STL 课程,了解语言和 STL。要完成从学习语言到 DS Algo 等的准备工作,请参阅完整的面试准备课程