// ======================= my_template.h =======================
#ifndef MY_TEMPLATE_H
#define MY_TEMPLATE_H

#include <iostream>

// General template class (non-CUDA, default implementation)
template<typename T>
class MyTemplate {
public:
    void compute() {
        std::cout << "Default CPU compute for generic type\n";
        // Default implementation (e.g., CPU-only)
    }
};

// Explicit specialization for float.  The full class definition must be
// visible here so that main.cpp can create MyTemplate<float> objects;
// only compute() is defined out of line, in the CUDA translation unit
// my_template_float.cu.
template<>
class MyTemplate<float> {
public:
    void compute();  // CUDA-accelerated, defined in my_template_float.cu
};

#endif // MY_TEMPLATE_H

// =================== my_template_float.cu ====================

#include "my_template.h"

#include <cuda_runtime.h>

#include <iostream>

// Simple CUDA kernel for demonstration
__global__ void computeKernelFloat(float* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        data[idx] = data[idx] * 2.0f;  // Example operation: double each element
    }
}

// Out-of-line definition of the float specialization's compute().
// Members of an explicit class specialization are defined like ordinary
// class members, without a leading template<>.
void MyTemplate<float>::compute() {
    std::cout << "CUDA-accelerated compute for float\n";

    const int size = 1024;
    float* h_data = new float[size];
    for (int i = 0; i < size; ++i) h_data[i] = static_cast<float>(i);

    float* d_data = nullptr;
    cudaMalloc(&d_data, size * sizeof(float));
    cudaMemcpy(d_data, h_data, size * sizeof(float), cudaMemcpyHostToDevice);

    int threads = 256;
    int blocks = (size + threads - 1) / threads;
    computeKernelFloat<<<blocks, threads>>>(d_data, size);
    cudaDeviceSynchronize();  // Ensure the kernel has finished

    cudaMemcpy(h_data, d_data, size * sizeof(float), cudaMemcpyDeviceToHost);

    std::cout << "Sample result: " << h_data[0] << ", " << h_data[1] << "\n";

    cudaFree(d_data);
    delete[] h_data;
}

// No explicit instantiation is needed: MyTemplate<float> is an explicit
// specialization declared in my_template.h, and its only member is
// defined above.

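// Optional hardening (a sketch, not from the original paste): the CUDA
// calls above ignore their return codes for brevity.  A common pattern
// is a checking macro such as the illustrative CUDA_CHECK below, which
// could wrap each cudaMalloc/cudaMemcpy/cudaDeviceSynchronize call.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__   \
                      << ": " << cudaGetErrorString(err_) << "\n";         \
        }                                                                  \
    } while (0)

// Example usage: CUDA_CHECK(cudaMalloc(&d_data, size * sizeof(float)));
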
// ========================= main.cpp ==========================

#include "my_template.h"

int main() {
    MyTemplate<int> obj_int;
    obj_int.compute();  // CPU fallback

    MyTemplate<float> obj_float;
    obj_float.compute();  // CUDA-accelerated

    return 0;
}

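// ==================== building (a sketch) =====================
//
// One way the three files above could be compiled and linked; the flags
// and output name here are assumptions, not part of the original paste.
// The .cu file must be compiled by nvcc; letting nvcc drive the link
// pulls in the CUDA runtime automatically.
//
//   nvcc -c my_template_float.cu -o my_template_float.o
//   g++  -c main.cpp             -o main.o
//   nvcc main.o my_template_float.o -o template_demo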