haha

import numpy as np
from pylab import imshow, show
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

# Number of iterations for each point
max_iterations = 50

# Width and height of the image
width = 1024
height = 1024

# The real and complex parts of the coordinates
real_min = -2.0
real_max = 1.0
imag_min = -1.5
imag_max = 1.5

# Generate the coordinates and copy them to the device
x, y = np.mgrid[real_min:real_max:(width*1j), imag_min:imag_max:(height*1j)]
x_gpu = cuda.mem_alloc(x.nbytes)
y_gpu = cuda.mem_alloc(y.nbytes)
cuda.memcpy_htod(x_gpu, x)
cuda.memcpy_htod(y_gpu, y)

# Generate the function for the mandelbrot calculation on the device
mod = SourceModule("""
__global__ void mandelbrot(int max_iterations, float *x, float *y, int *output) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    float real = x[idx];
    float imag = y[idx];

    float real_step = (x[blockDim.x] - x[0]) / blockDim.x;
    float imag_step = (y[blockDim.y] - y[0]) / blockDim.y;

    float real2 = 0;
    float imag2 = 0;

    int iteration;

    for (iteration = 0; iteration < max_iterations; iteration++) {
        float real2_temp = real2 * real2;
        float imag2_temp = imag2 * imag2;

        if (real2_temp + imag2_temp > 4) {
            break;
        }

        imag2 = 2 * real * imag + imag2;
        real2 = real2_temp - imag2_temp + real;

        real += real_step;
        imag += imag_step;
    }

    output[idx] = iteration;
}
""")
mandelbrot = mod.get_function("mandelbrot")
output = np.zeros(width * height, dtype=np.int32)
output_gpu = cuda.mem_alloc(output.nbytes)
mandelbrot(np.int32(max_iterations), x_gpu, y_gpu, output_gpu, block=(width, height, 1), grid=(width/16, height/16))
cuda.memcpy_dtoh(output, output_gpu)
output = output.reshape((height, width))
imshow(output)
show()