Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// ======================= my_template.h =======================
#ifndef MY_TEMPLATE_H
#define MY_TEMPLATE_H
#include <iostream>

// Primary template: default CPU-only implementation for any type T.
template<typename T>
class MyTemplate {
public:
    // Placeholder generic CPU code path; prints a diagnostic message.
    void compute() {
        std::cout << "Default CPU compute for generic type\n";
    }
};

// Complete declaration (not a bare forward declaration) of the float
// specialization. Translation units such as main.cpp see only this header,
// so the specialized class must be a complete type here -- otherwise
// `MyTemplate<float> obj;` fails to compile with an incomplete-type error.
// The member function is defined in my_template_float.cu so the CUDA code
// stays in a .cu translation unit compiled by nvcc.
template<>
class MyTemplate<float> {
public:
    void compute(); // CUDA-accelerated; defined in my_template_float.cu
};
#endif // MY_TEMPLATE_H

// =================== my_template_float.cu ====================
#include <iostream>
#include <vector>
#include <cstdlib>
#include "my_template.h"
#include <cuda_runtime.h>

// Abort with a readable message on any CUDA API failure. Kernel launches
// are asynchronous and return no status directly, so errors must be polled
// explicitly via cudaGetLastError()/cudaDeviceSynchronize().
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(err_) << "\n";          \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Doubles each element of `data` in place.
// Expects a 1-D launch with at least `size` total threads; the bounds
// check handles the partial tail block when size % blockDim.x != 0.
__global__ void computeKernelFloat(float* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        data[idx] = data[idx] * 2.0f;
    }
}

// Out-of-class definition of the member declared in my_template.h.
// Fills a host buffer with 0..size-1, doubles it on the GPU via
// computeKernelFloat, copies the result back, and prints two samples.
void MyTemplate<float>::compute() {
    std::cout << "CUDA-accelerated compute for float\n";
    const int size = 1024;

    // std::vector instead of raw new[]/delete[]: exception-safe, no leak.
    std::vector<float> h_data(size);
    for (int i = 0; i < size; ++i) h_data[i] = static_cast<float>(i);

    float* d_data = nullptr;
    CUDA_CHECK(cudaMalloc(&d_data, size * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_data, h_data.data(), size * sizeof(float),
                          cudaMemcpyHostToDevice));

    const int threads = 256;
    const int blocks = (size + threads - 1) / threads; // ceil-div grid
    computeKernelFloat<<<blocks, threads>>>(d_data, size);
    CUDA_CHECK(cudaGetLastError());      // catches launch-config errors
    CUDA_CHECK(cudaDeviceSynchronize()); // catches async execution errors

    CUDA_CHECK(cudaMemcpy(h_data.data(), d_data, size * sizeof(float),
                          cudaMemcpyDeviceToHost));
    std::cout << "Sample result: " << h_data[0] << ", " << h_data[1] << "\n";
    CUDA_CHECK(cudaFree(d_data));
}
- // ========================= main.cpp ==========================
- #include "my_template.h"
// Demonstrates template dispatch: the primary template provides the
// generic (CPU) path, while the float specialization provides the
// CUDA-accelerated path.
int main() {
    MyTemplate<int> genericCompute;
    genericCompute.compute();  // primary template: CPU fallback

    MyTemplate<float> floatCompute;
    floatCompute.compute();    // explicit specialization: CUDA path

    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement