This example:
// http://math.uaa.alaska.edu/~afkjm/cs448/handouts/cuda-firstprograms.pdf
#include "stdio.h"
#define N 10
__global__ void add(int *a, int *b, int *c)
{
int tID = blockIdx.x;
if (tID < N)
{
c[tID] = a[tID] + b[tID];
}
}
int main()
{
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_b, N*sizeof(int));
cudaMalloc((void **) &dev_c, N*sizeof(int));
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i,
b[i] = 1;
}
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
add<<<N,1>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++)
{
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
return 0;
}
converted to Pelles C#define WIN32_LEAN_AND_MEAN
#include <windows.h>
//#include "cuda.h"
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "msvcrt.lib")
#define CUDAAPI __stdcall
typedef int CUdevice;
typedef struct CUctx_st *CUcontext;
typedef struct CUmod_st *CUmodule;
typedef struct CUfunc_st *CUfunction;
typedef struct CUstream_st *CUstream;
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
typedef unsigned long long CUdeviceptr;
#else
typedef unsigned int CUdeviceptr;
#endif
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
CUDA_ERROR_OUT_OF_MEMORY = 2,
CUDA_ERROR_NOT_INITIALIZED = 3,
CUDA_ERROR_DEINITIALIZED = 4,
CUDA_ERROR_NO_DEVICE = 100,
CUDA_ERROR_INVALID_DEVICE = 101,
CUDA_ERROR_INVALID_IMAGE = 200,
CUDA_ERROR_INVALID_CONTEXT = 201,
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,
CUDA_ERROR_MAP_FAILED = 205,
CUDA_ERROR_UNMAP_FAILED = 206,
CUDA_ERROR_ARRAY_IS_MAPPED = 207,
CUDA_ERROR_ALREADY_MAPPED = 208,
CUDA_ERROR_NO_BINARY_FOR_GPU = 209,
CUDA_ERROR_ALREADY_ACQUIRED = 210,
CUDA_ERROR_NOT_MAPPED = 211,
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,
CUDA_ERROR_ECC_UNCORRECTABLE = 214,
CUDA_ERROR_UNSUPPORTED_LIMIT = 215,
CUDA_ERROR_INVALID_SOURCE = 300,
CUDA_ERROR_FILE_NOT_FOUND = 301,
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,
CUDA_ERROR_OPERATING_SYSTEM = 304,
CUDA_ERROR_INVALID_HANDLE = 400,
CUDA_ERROR_NOT_FOUND = 500,
CUDA_ERROR_NOT_READY = 600,
CUDA_ERROR_LAUNCH_FAILED = 700,
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,
CUDA_ERROR_LAUNCH_TIMEOUT = 702,
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,
CUDA_ERROR_UNKNOWN = 999
} CUresult;
CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
CUresult CUDAAPI cuInit(unsigned int Flags);
CUresult CUDAAPI cuDeviceGetCount(int *count);
CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
CUresult CUDAAPI cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr);
CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
CUresult CUDAAPI cuLaunchKernel(CUfunction f,
unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
unsigned int sharedMemBytes, CUstream hStream,
void **kernelParams, void **extra);
CUresult CUDAAPI cuCtxSynchronize(void);
#define N 10
void __cdecl mainCRTStartup(void)
{
int deviceCount = 0;
int iDrvVer;
unsigned int iApiVer;
CUdevice dev;
CUcontext ctx;
CUmodule mod;
CUresult err;
CUfunction function;
int iMajor, iMinor;
char *perr;
cuDriverGetVersion(&iDrvVer);
printf("Driver Version: %d.%d\n", iDrvVer / 1000, iDrvVer % 1000);
if (!(err = cuInit(0)))
{
err = cuDeviceGetCount(&deviceCount);
err = cuDeviceGet(&dev, 0);
err = cuDeviceComputeCapability(&iMajor, &iMinor, dev);
printf("DeviceComputeCapability: %d.%d\n", iMajor, iMinor);
err = cuCtxCreate(&ctx, 0, dev);
err = cuCtxGetApiVersion(ctx, &iApiVer);
printf("API Version: %d.%d\n", iApiVer / 1000, iApiVer % 1000);
if (!err)
{
err = cuModuleLoad(&mod, "Add.ptx");
if (!err)
{
err = cuModuleGetFunction(&function, mod, "add");
if (!err)
{
int a[N], b[N], c[N];
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i, b[i] = 1;
}
CUdeviceptr d_a, d_b, d_c;
//unsigned int nMemSize = sizeof(int) * N;
cuMemAlloc(&d_a, N*sizeof(int));
cuMemcpyHtoD(d_a, a, N*sizeof(int));
cuMemAlloc(&d_b, N*sizeof(int));
cuMemcpyHtoD(d_b, b, N*sizeof(int));
cuMemAlloc(&d_c, N*sizeof(int));
//int count = N;
void *args[] = { &d_a, &d_b, &d_c };
err = cuLaunchKernel(function, N, 1, 1, 1, 1, 1, 0, 0, &args[0], 0);
if (err) {
cuGetErrorString(err, &perr);
printf("error cuLaunchKernel: %u %s\n", err, perr);
}
err = cuCtxSynchronize();
cuMemcpyDtoH(c, d_c, N*sizeof(int));
cuMemFree(d_a);
cuMemFree(d_b);
cuMemFree(d_c);
for (int i = 0; i < N; i++)
{
//printf("c[%d] = %d\n", i, c[i]);
printf("%d + %d = %d\n", a[i], b[i], c[i]);
}
}
else
{
cuGetErrorString(err, &perr);
printf("error get function: %u %s\n", err, perr);
}
}
else
{
cuGetErrorString(err, &perr);
printf("error loading ptx: %u %s\n", err, perr);
}
err = cuCtxDetach(ctx);
}
}
ExitProcess(0);
}
create Add.ptx with cicc.exe
Add.ptx for an old GeForce G210 (GT 218) was created with nvopencc.exenvopencc.exe -TARG:compute_10 Add.i
Add.itypedef struct uint3
{
unsigned int x, y, z;
}uint3;
uint3 extern const threadIdx;
uint3 extern const blockIdx;
//#define N 10
//__global__
__attribute__((global)) __attribute__((__used__))
void add(int *a, int *b, int *c)
{
int tID = blockIdx.x;
if (tID < 10)
{
c[tID] = a[tID] + b[tID];
}
}