Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
// ======================= my_template.h =======================
#ifndef MY_TEMPLATE_H
#define MY_TEMPLATE_H
#include <iostream>

// Primary template: default CPU-only implementation for any type T.
template<typename T>
class MyTemplate {
public:
    // Placeholder generic CPU code path; prints a diagnostic message.
    void compute() {
        std::cout << "Default CPU compute for generic type\n";
    }
};

// Complete declaration (not a bare forward declaration) of the float
// specialization. Translation units such as main.cpp see only this header,
// so the specialized class must be a complete type here -- otherwise
// `MyTemplate<float> obj;` fails to compile with an incomplete-type error.
// The member function is defined in my_template_float.cu so the CUDA code
// stays in a .cu translation unit compiled by nvcc.
template<>
class MyTemplate<float> {
public:
    void compute(); // CUDA-accelerated; defined in my_template_float.cu
};
#endif // MY_TEMPLATE_H

// =================== my_template_float.cu ====================
#include <iostream>
#include <vector>
#include <cstdlib>
#include "my_template.h"
#include <cuda_runtime.h>

// Abort with a readable message on any CUDA API failure. Kernel launches
// are asynchronous and return no status directly, so errors must be polled
// explicitly via cudaGetLastError()/cudaDeviceSynchronize().
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__       \
                      << ": " << cudaGetErrorString(err_) << "\n";          \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Doubles each element of `data` in place.
// Expects a 1-D launch with at least `size` total threads; the bounds
// check handles the partial tail block when size % blockDim.x != 0.
__global__ void computeKernelFloat(float* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        data[idx] = data[idx] * 2.0f;
    }
}

// Out-of-class definition of the member declared in my_template.h.
// Fills a host buffer with 0..size-1, doubles it on the GPU via
// computeKernelFloat, copies the result back, and prints two samples.
void MyTemplate<float>::compute() {
    std::cout << "CUDA-accelerated compute for float\n";
    const int size = 1024;

    // std::vector instead of raw new[]/delete[]: exception-safe, no leak.
    std::vector<float> h_data(size);
    for (int i = 0; i < size; ++i) h_data[i] = static_cast<float>(i);

    float* d_data = nullptr;
    CUDA_CHECK(cudaMalloc(&d_data, size * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_data, h_data.data(), size * sizeof(float),
                          cudaMemcpyHostToDevice));

    const int threads = 256;
    const int blocks = (size + threads - 1) / threads; // ceil-div grid
    computeKernelFloat<<<blocks, threads>>>(d_data, size);
    CUDA_CHECK(cudaGetLastError());      // catches launch-config errors
    CUDA_CHECK(cudaDeviceSynchronize()); // catches async execution errors

    CUDA_CHECK(cudaMemcpy(h_data.data(), d_data, size * sizeof(float),
                          cudaMemcpyDeviceToHost));
    std::cout << "Sample result: " << h_data[0] << ", " << h_data[1] << "\n";
    CUDA_CHECK(cudaFree(d_data));
}
- // ========================= main.cpp ==========================
- #include "my_template.h"
// Demonstrates template dispatch: the primary template provides the
// generic (CPU) path, while the float specialization provides the
// CUDA-accelerated path.
int main() {
    MyTemplate<int> genericCompute;
    genericCompute.compute();  // primary template: CPU fallback

    MyTemplate<float> floatCompute;
    floatCompute.compute();    // explicit specialization: CUDA path

    return 0;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement