在B站上找到的视频,学习之后,自己总结了一下,归档留存以后使用
有人下载了CUDA Toolkit和cuDNN,但是我这边只安装了CUDA Toolkit就能在vs中运行代码了
CUDA Toolkit下载链接:https://developer.nvidia.com/cuda-toolkit-archive(我下载的是12.3,当时的最新版)
cuDNN Archive | NVIDIA Developer下载链接:https://developer.nvidia.com/rdp/cudnn-archive(我下载的是v8.9.5,能支持CUDA 12.x)
选择安装部件时,我是全选的,怕不全选安装不全,至少能用,不差这些空间
找到一个固定能找到的安装位置,此处是我安装的位置
安装完成之后就可以在vs中看到sample代码了
后面使用vs写代码:
3.配置库(记住!!配置时先改为debug和x64,release版本应该也是同理)
4.配置静态链接库路径
至此配置完成,与我参考的帖子有些区别,但至少能跑代码了
下面在源文件处创建test.cu文件,这个代码是自带的测试程序,这个能跑了,cuda就差不多了
// NOTE: the pasted source used typographic quotes (“ ”) in these includes,
// which the preprocessor rejects; they must be plain double quotes.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Host-side helper (defined below) that owns all device allocation,
// transfer, launch and cleanup for the vector addition.
cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size);

// Element-wise vector addition: c[i] = a[i] + b[i].
// The pasted source had lost the leading underscores of __global__.
// Launched as <<<1, size>>> (one block, one thread per element), so
// threadIdx.x indexes the arrays directly; the caller must keep size
// within the device's max-threads-per-block limit (5 here, so it is safe).
__global__ void addKernel(int* c, const int* a, const int* b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}
// Entry point of the bundled SDK sample: adds two 5-element vectors on the
// GPU and prints the result. Returns 0 on success, 1 on any CUDA failure.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel (helper handles all device-side work).
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
// Allocates three device buffers, copies the inputs to the GPU, launches
// addKernel with one thread per element, copies the sum back into c, and
// frees all device memory on every exit path (via the Error label).
// Returns cudaSuccess, or the first CUDA error encountered.
// NOTE: the pasted source had lost one '*' from each (void**) cast and the
// straight quotes / \n escapes in the error messages; both are restored here.
cudaError_t addWithCuda(int* c, const int* a, const int* b, unsigned int size)
{
    int* dev_a = 0;
    int* dev_b = 0;
    int* dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Kernel launches return no status directly; launch-configuration
    // errors must be fetched with cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    // cudaFree(NULL) is a no-op, so unconditional cleanup is safe here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
下面再给个代码,这个是查看显卡设备信息的代码
// Second, stand-alone sample program (separate .cu file): enumerates all
// CUDA devices and prints their properties. The pasted source was mangled:
// typographic quotes, stripped \n escapes, mojibake "齝es" where "%zu bytes"
// format specifiers were, and "minor = 9999" (assignment) where "minor == 9999"
// (comparison) was intended — all restored here.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    int dev;
    for (dev = 0; dev < deviceCount; dev++)
    {
        int driver_version = 0, runtime_version = 0;
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        // compute capability 9999.9999 is the runtime's sentinel for
        // "no real CUDA-capable device" — only worth reporting once.
        if (dev == 0)
            if (deviceProp.minor == 9999 && deviceProp.major == 9999)
                printf("\n");
        printf("Device %d: \"%s\"\n", dev, deviceProp.name);
        cudaDriverGetVersion(&driver_version);
        printf("CUDA驱动版本: %d.%d\n", driver_version / 1000, (driver_version % 1000) / 10);
        cudaRuntimeGetVersion(&runtime_version);
        printf("CUDA运行时版本: %d.%d\n", runtime_version / 1000, (runtime_version % 1000) / 10);
        printf("设备计算能力: %d.%d\n", deviceProp.major, deviceProp.minor);
        // totalGlobalMem, totalConstMem, sharedMemPerBlock, memPitch and
        // texturePitchAlignment are size_t fields, hence %zu.
        printf("Total amount of Global Memory: %zu bytes\n", deviceProp.totalGlobalMem);
        printf("Number of SMs: %d\n", deviceProp.multiProcessorCount);
        printf("Total amount of Constant Memory: %zu bytes\n", deviceProp.totalConstMem);
        printf("Total amount of Shared Memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("Warp size: %d\n", deviceProp.warpSize);
        printf("Maximum number of threads per SM: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf("Maximum size of each dimension of a block: %d x %d x %d\n", deviceProp.maxThreadsDim[0],
            deviceProp.maxThreadsDim[1],
            deviceProp.maxThreadsDim[2]);
        printf("Maximum size of each dimension of a grid: %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf("Texture alignment: %zu bytes\n", deviceProp.texturePitchAlignment);
        // clockRate / memoryClockRate are reported in kHz, hence the scaling.
        printf("Clock rate: %.2f GHz\n", deviceProp.clockRate * 1e-6f);
        printf("Memory Clock rate: %.0f MHz\n", deviceProp.memoryClockRate * 1e-3f);
        printf("Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
    }
    return 0;
}
具体应该怎么写cuda代码还需要在C的基础上单独学习,主要是内存和并行核的控制
参考:
https://blog.csdn.net/ashiners/article/details/

版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容,请联系我们,一经查实,本站将立刻删除。
如需转载请保留出处:https://51itzy.com/kjqy/170282.html