首页 \ 问答 \ CUDA中的原子操作失败(Atomic Operation failed in CUDA)

CUDA中的原子操作失败(Atomic Operation failed in CUDA)

由于计算能力为2.1, atomicAddatomicMax操作不支持双精度,然后我根据堆栈溢出的一些答案定义这两个函数。

很奇怪atomicAdd函数运行良好,但atomicMax不起作用,这是我的代码。

我的代码测试是在每个块上生成随机数,然后对每个块上的随机数求和,我们有块和,我想在块和上测试atomicAddatomicMax

#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>


#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4


__device__ double gsum[num_of_blocks];

__device__ double dev_sum;

__device__ double dev_max;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// atomiMax for double
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* address_as_i = (unsigned long long int*)address;
    unsigned long long int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do{
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    }while(assumed != old);
    return __longlong_as_double(old);
}

__global__ void kernel(curandState *globalState){
    // global id
    int gidx    = threadIdx.x + blockIdx.x * blockDim.x;
    // local id
    int lidx    = threadIdx.x;

    // creat shared memory to store seeds
    __shared__ curandState localState[tot_threads];

    __shared__ double srandnum[threads_per_block];

    // copy global seed to local
    localState[lidx]    = globalState[gidx];

    //synchronize the local threads writing to the local memory cache
    __syncthreads();

    // generate random number from normal distribution in shared memory
    srandnum[lidx]  = curand_normal(&localState[lidx]);
    __syncthreads();

    if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];}   // sum of each block
    if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}      // copy the sums back to global memory

    __threadfence();

    if( gidx < num_of_blocks){
        atomicAdd_d(&dev_sum, gsum[gidx]);
    }

    if( gidx < num_of_blocks){
        atomicMax_d(&dev_max, gsum[gidx]);
    }

    if( gidx == 0){
        printf("Sum is: %lf\n", dev_sum);
    }

    if( gidx == 1){
        printf("Max is: %lf\n", dev_max);
    }
}


int main(){
    // set seed on device
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);

    // launch kernel
    kernel<<<num_of_blocks, threads_per_block>>>(globalState);
    double randnum[num_of_blocks];

    cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);

    std::cout << "Sum of each block:\n";
    for (int i = 0; i < num_of_blocks; ++i){
        std::cout << randnum[i] << std::endl;
    }

    cudaFree(globalState);
    return 0;
}

我得到的结果是

Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303

从结果中,我知道atomicAdd函数有效,但atomicMax函数不起作用,我不知道这一点。 先谢谢。


As the compute ability is 2.1, the atomicAdd and atomicMax operations do not support double precision, then I define both functions based on some answers on stack overflow.

It is strange that the atomicAdd function works well but the atomicMax doesn't work, here is my code.

The test of my code is to generate random number on each block, and then sum the random numbers on each block, we have block sum, I want to test the atomicAdd and atomicMax on the block sum.

#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>


#define num_of_blocks 2
#define threads_per_block 2
#define tot_threads 4


__device__ double gsum[num_of_blocks];

__device__ double dev_sum;

__device__ double dev_max;

// set seed for random number generator
__global__ void initcuRand(curandState* globalState, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &globalState[idx]);
}

// atomiMax for double
__device__ double atomicMax_d(double* address, double val)
{
    unsigned long long int* address_as_i = (unsigned long long int*)address;
    unsigned long long int old = *address_as_i, assumed;
    do {
        assumed = old;
        old = ::atomicCAS(address_as_i, assumed, __double_as_longlong(::fmax(val, __longlong_as_double(assumed))));
    } while (assumed != old);
    return __longlong_as_double(old);
}

// atomicAdd for double
__device__ double atomicAdd_d(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do{
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
    }while(assumed != old);
    return __longlong_as_double(old);
}

__global__ void kernel(curandState *globalState){
    // global id
    int gidx    = threadIdx.x + blockIdx.x * blockDim.x;
    // local id
    int lidx    = threadIdx.x;

    // creat shared memory to store seeds
    __shared__ curandState localState[tot_threads];

    __shared__ double srandnum[threads_per_block];

    // copy global seed to local
    localState[lidx]    = globalState[gidx];

    //synchronize the local threads writing to the local memory cache
    __syncthreads();

    // generate random number from normal distribution in shared memory
    srandnum[lidx]  = curand_normal(&localState[lidx]);
    __syncthreads();

    if(lidx == 0){srandnum[lidx] += srandnum[lidx + 1];}   // sum of each block
    if(lidx == 0){gsum[blockIdx.x] = srandnum[lidx];}      // copy the sums back to global memory

    __threadfence();

    if( gidx < num_of_blocks){
        atomicAdd_d(&dev_sum, gsum[gidx]);
    }

    if( gidx < num_of_blocks){
        atomicMax_d(&dev_max, gsum[gidx]);
    }

    if( gidx == 0){
        printf("Sum is: %lf\n", dev_sum);
    }

    if( gidx == 1){
        printf("Max is: %lf\n", dev_max);
    }
}


int main(){
    // set seed on device
    curandState *globalState;
    cudaMalloc((void**)&globalState, tot_threads*sizeof(curandState));
    initcuRand<<<num_of_blocks, threads_per_block>>>(globalState, 1);

    // launch kernel
    kernel<<<num_of_blocks, threads_per_block>>>(globalState);
    double randnum[num_of_blocks];

    cudaMemcpyFromSymbol(randnum, gsum, num_of_blocks*sizeof(double), 0, cudaMemcpyDeviceToHost);

    std::cout << "Sum of each block:\n";
    for (int i = 0; i < num_of_blocks; ++i){
        std::cout << randnum[i] << std::endl;
    }

    cudaFree(globalState);
    return 0;
}

The result I get is

Sum is: -0.898329
Max is: 0.000000
Sum of each block:
-0.0152994
-0.88303

From the result, I know that the atomicAdd function works but the atomicMax function doesn't work, I have no idea of this. Thanks beforehand.


原文:https://stackoverflow.com/questions/40836630
更新时间:2023-05-18 15:05

最满意答案

我不确定,但可能是错误发生,因为您没有验证架构中的“类别”。 尝试在“类别”中添加“blackbox:true”,以便它接受任何类型的对象。

Industry.attachSchema(new SimpleSchema({
    label: {
        type: String
    },
    value: {
        type: String
    },
    categories: {
        type: [Object],
        blackbox:true     // allows all objects
    }          
}));

完成后,尝试像这样添加值

var newObject = {
    id: categoryId,
    label: newCategory,
    value: newCategory
}
Industry.update({   
    _id: industryId
}, {
    $push: {
        categories: newObject     //newObject can be anything
    }
});

这将允许您将任何类型的对象添加到类别字段中。 但是你在评论中提到类别也是另一个集合。 如果您已经有类别的SimpleSchema,那么您可以验证categories字段以仅接受与SimpleSchema匹配的对象,如此类别

Industry.attachSchema(new SimpleSchema({
    label: {
        type: String
    },
    value: {
        type: String
    },
    categories: {
        type: [categoriesSchema]    // replace categoriesSchema by name of SimpleSchema for categories
    }          
}));

在这种情况下,只有与categoriesSchema匹配的对象才会被允许进入类别字段。 任何其他类型都将被过滤掉。 你也不会在控制台上遇到任何错误,试图插入其他类型。(这是我认为当你尝试现在插入因为没有指定验证时发生的事情)

编辑:答案的解释

在SimpleSchema中,当您定义一个对象数组时,您必须验证它,即,您必须告诉它它可以接受哪些对象以及它不能接受哪些对象。 例如,当你定义它时

...
categories: {
    type: [categoriesSchema]    // Correct
} 

这意味着只能在其中插入与名为categoriesSchema的另一个SimpleSchema结构相似的对象。 根据您的示例,您尝试插入的任何对象都应采用此格式

{
    id: categoryId,
    label: newCategory,
    value: newCategory
}

插入时将拒绝任何不具有此格式的对象。 这就是为什么你尝试插入的所有对象,当你最初尝试使用这样结构化的架构时,这些对象被拒绝

...
categories: {
    type: [Object]     // Not correct as there is no SimpleSchema named 'Object' to match with
}

黑盒:真

现在,假设您没有要过滤的对象,并希望在未经验证的情况下插入所有对象。 这就是设置“blackbox:true”的地方。如果你定义了这样的字段

...
categories: {
    type: [Object],     // Correct
    blackbox:true
}

这意味着类别可以是任何对象,不需要针对其他一些SimpleSchema进行验证。 所以无论你试图插入什么,都会被接受。


I am not sure but maybe the error is occuring because you are not validating 'categories' in your schema. Try adding a 'blackbox:true' to your 'categories' so that it accepts any types of objects.

Industry.attachSchema(new SimpleSchema({
    label: {
        type: String
    },
    value: {
        type: String
    },
    categories: {
        type: [Object],
        blackbox:true     // allows all objects
    }          
}));

Once you've done that try adding values to it like this

var newObject = {
    id: categoryId,
    label: newCategory,
    value: newCategory
}
Industry.update({   
    _id: industryId
}, {
    $push: {
        categories: newObject     //newObject can be anything
    }
});

This would allow you to add any kind of object into the categories field. But you mentioned in a comment that categories is also another collection. If you already have a SimpleSchema for categories then you could validate the categories field to only accept objects that match with the SimpleSchema for categories like this

Industry.attachSchema(new SimpleSchema({
    label: {
        type: String
    },
    value: {
        type: String
    },
    categories: {
        type: [categoriesSchema]    // replace categoriesSchema by name of SimpleSchema for categories
    }          
}));

In this case only objects that match categoriesSchema will be allowed into categories field. Any other type would be filtered out. Also you wouldnt get any error on console for trying to insert other types.(which is what i think is happening when you try to insert now as no validation is specified)

EDIT : EXPLANATION OF ANSWER

In a SimpleSchema when you define an array of objects you have to validate it,ie, you have to tell it what objects it can accept and what it can't. For example when you define it like

...
categories: {
    type: [categoriesSchema]    // Correct
} 

it means that objects that are similar in structure to those in another SimpleSchema named categoriesSchema only can be inserted into it. According to your example any object you try to insert should be of this format

{
    id: categoryId,
    label: newCategory,
    value: newCategory
}

Any object that isn't of this format will be rejected while insert. Thats why all objects you tried to insert where rejected when you tried initially with your schema structured like this

...
categories: {
    type: [Object]     // Not correct as there is no SimpleSchema named 'Object' to match with
}

Blackbox:true

Now, lets say you don't what your object to be filtered and want all objects to be inserted without validation. Thats where setting "blackbox:true" comes in. If you define a field like this

...
categories: {
    type: [Object],     // Correct
    blackbox:true
}

it means that categories can be any object and need not be validated with respect to some other SimpleSchema. So whatever you try to insert gets accepted.

相关问答

更多

相关文章

更多

最新问答

更多
  • h2元素推动其他h2和div。(h2 element pushing other h2 and div down. two divs, two headers, and they're wrapped within a parent div)
  • 创建一个功能(Create a function)
  • 我投了份简历,是电脑编程方面的学徒,面试时说要培训三个月,前面
  • PDO语句不显示获取的结果(PDOstatement not displaying fetched results)
  • Qt冻结循环的原因?(Qt freezing cause of the loop?)
  • TableView重复youtube-api结果(TableView Repeating youtube-api result)
  • 如何使用自由职业者帐户登录我的php网站?(How can I login into my php website using freelancer account? [closed])
  • SQL Server 2014版本支持的最大数据库数(Maximum number of databases supported by SQL Server 2014 editions)
  • 我如何获得DynamicJasper 3.1.2(或更高版本)的Maven仓库?(How do I get the maven repository for DynamicJasper 3.1.2 (or higher)?)
  • 以编程方式创建UITableView(Creating a UITableView Programmatically)
  • 如何打破按钮上的生命周期循环(How to break do-while loop on button)
  • C#使用EF访问MVC上的部分类的自定义属性(C# access custom attributes of a partial class on MVC with EF)
  • 如何获得facebook app的publish_stream权限?(How to get publish_stream permissions for facebook app?)
  • 如何防止调用冗余函数的postgres视图(how to prevent postgres views calling redundant functions)
  • Sql Server在欧洲获取当前日期时间(Sql Server get current date time in Europe)
  • 设置kotlin扩展名(Setting a kotlin extension)
  • 如何并排放置两个元件?(How to position two elements side by side?)
  • 如何在vim中启用python3?(How to enable python3 in vim?)
  • 在MySQL和/或多列中使用多个表用于Rails应用程序(Using multiple tables in MySQL and/or multiple columns for a Rails application)
  • 如何隐藏谷歌地图上的登录按钮?(How to hide the Sign in button from Google maps?)
  • Mysql左连接旋转90°表(Mysql Left join rotate 90° table)
  • dedecms如何安装?
  • 在哪儿学计算机最好?
  • 学php哪个的书 最好,本人菜鸟
  • 触摸时不要突出显示表格视图行(Do not highlight table view row when touched)
  • 如何覆盖错误堆栈getter(How to override Error stack getter)
  • 带有ImageMagick和许多图像的GIF动画(GIF animation with ImageMagick and many images)
  • USSD INTERFACE - > java web应用程序通信(USSD INTERFACE -> java web app communication)
  • 电脑高中毕业学习去哪里培训
  • 正则表达式验证SMTP响应(Regex to validate SMTP Responses)