NO

Author Topic: CUDA and OpenCL examples  (Read 4394 times)

Offline TimoVJL

  • Global Moderator
  • Member
  • *****
  • Posts: 2115
CUDA and OpenCL examples
« on: May 21, 2019, 05:39:20 PM »
This example:
Code: [Select]
// http://math.uaa.alaska.edu/~afkjm/cs448/handouts/cuda-firstprograms.pdf
#include "stdio.h"
#define N 10
 
__global__ void add(int *a, int *b, int *c)
{
        int tID = blockIdx.x;
        if (tID < N)
        {
                c[tID] = a[tID] + b[tID];
        }
}
 
int main()
{
        int a[N], b[N], c[N];
        int *dev_a, *dev_b, *dev_c;
 
        cudaMalloc((void **) &dev_a, N*sizeof(int));
        cudaMalloc((void **) &dev_b, N*sizeof(int));
        cudaMalloc((void **) &dev_c, N*sizeof(int));
 
        // Fill Arrays
        for (int i = 0; i < N; i++)
        {
                a[i] = i,
                b[i] = 1;
        }
 
        cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
 
        add<<<N,1>>>(dev_a, dev_b, dev_c);
 
        cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
 
        for (int i = 0; i < N; i++)
        {
                printf("%d + %d = %d\n", a[i], b[i], c[i]);
        }
        return 0;
}
converted to Pelles C
Code: [Select]
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
//#include "cuda.h"
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "msvcrt.lib")

#define CUDAAPI __stdcall
typedef int CUdevice;
typedef struct CUctx_st *CUcontext;
typedef struct CUmod_st *CUmodule;
typedef struct CUfunc_st *CUfunction;
typedef struct CUstream_st *CUstream;
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
typedef unsigned long long CUdeviceptr;
#else
typedef unsigned int CUdeviceptr;
#endif
typedef enum cudaError_enum {
    CUDA_SUCCESS                              = 0,
    CUDA_ERROR_INVALID_VALUE                  = 1,
    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
    CUDA_ERROR_NOT_INITIALIZED                = 3,
    CUDA_ERROR_DEINITIALIZED                  = 4,
    CUDA_ERROR_NO_DEVICE                      = 100,
    CUDA_ERROR_INVALID_DEVICE                 = 101,
    CUDA_ERROR_INVALID_IMAGE                  = 200,
    CUDA_ERROR_INVALID_CONTEXT                = 201,
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
    CUDA_ERROR_MAP_FAILED                     = 205,
    CUDA_ERROR_UNMAP_FAILED                   = 206,
    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
    CUDA_ERROR_ALREADY_MAPPED                 = 208,
    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
    CUDA_ERROR_NOT_MAPPED                     = 211,
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
    CUDA_ERROR_INVALID_SOURCE                 = 300,
    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
    CUDA_ERROR_OPERATING_SYSTEM               = 304,
    CUDA_ERROR_INVALID_HANDLE                 = 400,
    CUDA_ERROR_NOT_FOUND                      = 500,
    CUDA_ERROR_NOT_READY                      = 600,
    CUDA_ERROR_LAUNCH_FAILED                  = 700,
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
    CUDA_ERROR_UNKNOWN                        = 999
} CUresult;
CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
CUresult CUDAAPI cuInit(unsigned int Flags);
CUresult CUDAAPI cuDeviceGetCount(int *count);
CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
CUresult CUDAAPI cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr);
CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
CUresult CUDAAPI cuLaunchKernel(CUfunction f,
                   unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
                   unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
                   unsigned int sharedMemBytes, CUstream hStream,
                   void **kernelParams, void **extra);
CUresult CUDAAPI cuCtxSynchronize(void);

#define N 10

void __cdecl mainCRTStartup(void)
{
int deviceCount = 0;
int iDrvVer;
unsigned int iApiVer;
CUdevice dev;
CUcontext ctx;
CUmodule mod;
CUresult err;
CUfunction function;
int iMajor, iMinor;
char *perr;

cuDriverGetVersion(&iDrvVer);
printf("Driver Version: %d.%d\n", iDrvVer / 1000, iDrvVer % 1000);
if (!(err = cuInit(0)))
{
err = cuDeviceGetCount(&deviceCount);
err = cuDeviceGet(&dev, 0);
err = cuDeviceComputeCapability(&iMajor, &iMinor, dev);
printf("DeviceComputeCapability: %d.%d\n", iMajor, iMinor);
err = cuCtxCreate(&ctx, 0, dev);
err = cuCtxGetApiVersion(ctx, &iApiVer);
printf("API Version: %d.%d\n", iApiVer / 1000, iApiVer % 1000);
if (!err)
{
err = cuModuleLoad(&mod, "Add.ptx");
if (!err)
{
err = cuModuleGetFunction(&function, mod, "add");
if (!err)
{
int a[N], b[N], c[N];
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i, b[i] = 1;
}

CUdeviceptr d_a, d_b, d_c;
//unsigned int nMemSize = sizeof(int) * N;
cuMemAlloc(&d_a, N*sizeof(int));
cuMemcpyHtoD(d_a, a, N*sizeof(int));
cuMemAlloc(&d_b, N*sizeof(int));
cuMemcpyHtoD(d_b, b, N*sizeof(int));
cuMemAlloc(&d_c, N*sizeof(int));
//int count = N;
void *args[] = { &d_a, &d_b, &d_c };
err = cuLaunchKernel(function, N, 1, 1, 1, 1, 1, 0, 0, &args[0], 0);
if (err) {
cuGetErrorString(err, &perr);
printf("error cuLaunchKernel: %u %s\n", err, perr);
}
err = cuCtxSynchronize();
cuMemcpyDtoH(c, d_c, N*sizeof(int));
cuMemFree(d_a);
cuMemFree(d_b);
cuMemFree(d_c);
for (int i = 0; i < N; i++)
{
//printf("c[%d] = %d\n", i, c[i]);
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
}
else
{
cuGetErrorString(err, &perr);
printf("error get function: %u %s\n", err, perr);
}
}
else
{
cuGetErrorString(err, &perr);
printf("error loading ptx: %u %s\n", err, perr);
}
err = cuCtxDetach(ctx);
}
}
ExitProcess(0);
}
create Add.ptx with cicc.exe

Add.ptx for an old GeForce G210 (GT 218) was created with nvopencc.exe
Code: [Select]
nvopencc.exe -TARG:compute_10 Add.iAdd.i
Code: [Select]
typedef struct  uint3
{
    unsigned int x, y, z;
}uint3;

uint3  extern const threadIdx;
uint3  extern const blockIdx;

//#define N 10
//__global__
__attribute__((global)) __attribute__((__used__))
void add(int *a, int *b, int *c)
{
 int tID = blockIdx.x;
 if (tID < 10)
 {
 c[tID] = a[tID] + b[tID];
 }
}
« Last Edit: May 25, 2019, 01:14:09 PM by TimoVJL »
May the source be with you