// OpenCL

/*
 * BackpropTrainer.java
 *
 * Copyright (C) August Mayer, 2001-2004. All rights reserved.
 * Please consult the Boone LICENSE file for additional rights granted to you.
 *
 * Created on 19. November 2002, 15:06
 */

package boone.training;

import boone.*;
import boone.io.*;
import boone.util.VarArray;
import java.nio.ByteBuffer;

import static org.jocl.CL.*;
import org.jocl.*;

/**
 *  Backpropagation trainer, with momentum (optional).
 *
 *  @author August Mayer
 *  @version $Id: BackpropTrainer.java 2028 2010-05-05 08:27:07Z amayer $
 */
public
class BackpropTrainer extends Trainer
{

    /** minimum relevant error, default 0 . */
    protected double minError = 0;

    /** the momentum. 0 by default, so momentum is turned off by standard. */
    public double momentum = 0.0;

    // OpenCL programs and kernels, one per kernel source string below
    // (forward iteration, error calculation, backpropagation).
    // Only the last declarator on each line carries the explicit "= null";
    // the others are null as well through Java's default field initialisation.
    protected cl_program program_iterate, program_errorcalc, program_backprop = null;
    protected cl_kernel kernel_iterate, kernel_errorcalc, kernel_backprop = null;
    // OpenCL context and command queue; both are assigned by SetupCL().
    protected cl_context clContext = null;
    protected cl_command_queue clCommandQueue = null;

    // Device buffers; allocated in GPUTrain, which also fixes the meaning of
    // every index of this array.
    protected cl_mem memObjects[] = null;

    //Number of elements = number of layers
    //Element value = number of nodes in that layer
    //(filled from net.generateGPUTickList() in GPUTrain)
    protected int GPUTickList[] = null;

    //Flattened input and target patterns handed to the kernels:
    //pattern p starts at p * pattern size.
    protected double GPUInputPatterns[] = null;
    protected double GPUTargetPatterns[] = null;
    //Single-element arrays holding the size of one input / target pattern.
    protected int GPUInputPatternSize[] = null;
    protected int GPUTargetPatternSize[] = null;
    //Weight of each input link, stored at
    //neuron index * GPUMaxNumFloats[0] + link index.
    protected double GPUWeights[] = null;
    //Host array with the same layout as GPUWeights, for the previous weight changes.
    protected double GPULastWeightChange[] = null;
    //Number of inputs (input links) to each neuron.
    protected int GPUNumInputs[] = null;
    //Activation function code per neuron: 1 = sigmoid, 0 = anything else.
    protected int GPUActFuncs[] = null;
    //Bias value per neuron, a 0/1 flag whether the neuron uses its bias,
    //and a host array for the previous bias changes.
    protected double GPUBias[] = null;
    protected int GPUUsingBias[] = null;
    protected double GPULastBiasChange[] = null;
    //0/1 flag per neuron: is input neuron / external input trigger
    protected int GPUIsInputNeuron[] = null;
    //0/1 flag per neuron: is output neuron / error gen trigger?
    protected int GPUIsOutputNeuron[] = null;
    //Current pattern index (single-element holder). GPUTrain as visible here
    //transfers the value through GPUMapCurrentPattern instead.
    protected int GPUCurrentPattern[] = {0};
    //Current index offset (single-element holder). GPUTrain as visible here
    //transfers the value through GPUMapIndexOffset instead.
    protected int GPUIndexOffset[] = {0};
    //Index of the source neuron of each input link, same layout as GPUWeights.
    protected int GPUInputs[] = null;
    //Error
    //protected double GPUError[] = null;
    //Learning rate; the initial value is the learn rate at construction time.
    protected double GPULearningRate[] = {this.learnRate};
    //Momentum; the initial value is the momentum at construction time.
    protected double GPUMomentum[] = {this.momentum};
    //Minimum error; the initial value is minError at construction time.
    protected double GPUMinError[] = {this.minError};


    //Largest number of input links found on any single neuron. Used as the
    //row stride of the per-link arrays (weights, inputs, last weight change):
    //entry j of neuron i lives at i * GPUMaxNumFloats[0] + j.
    protected int GPUMaxNumFloats[] = {0};

    //Host views of the mapped ("PINNED") buffers memObjects[9] and
    //memObjects[13]; they serve as the source of the writes that update the
    //current pattern index and the index offset on the device.
    protected ByteBuffer GPUMapCurrentPattern = null;
    protected ByteBuffer GPUMapIndexOffset = null;


    /**
     * OpenCL source of the forward ("iterate") kernel.
     * <p>
     * Each work-item handles the neuron with index
     * <code>get_global_id(0) + indexOffset[0]</code>. It sums
     * <code>values[source] * weight</code> over the neuron's first
     * <code>numInputs</code> link slots (slots are addressed as
     * <code>neuron * maxNumFloats[0] + i</code>), adds the bias when
     * <code>usingBias</code> is set, adds the matching element of the current
     * input pattern when <code>isInputNeuron</code> is set, applies the
     * logistic function when <code>activation</code> is 1, and stores the
     * result in <code>values</code>.
     * <p>
     * The loop always runs <code>maxNumFloats[0]</code> times and skips unused
     * slots with an <code>if</code>; the variant bounded by
     * <code>numInputs</code> is left commented out.
     */
    public static final String strKernIterate =
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                              \n" +
        "__kernel void Neuron(__global const double *inputPatterns,                 \n" +
        "                       __global double *weights,                           \n" +
        "                       __global const int *numInputs,                      \n" +
        "                       __global const int *activation,                     \n" +
        "                       __global const double *bias,                        \n" +
        "                       __global const int *usingBias,                      \n" +
        "                       __global double *values,                            \n" +
        "                       __global const int *maxNumFloats,                   \n" +
        "                       __global const int *patternIndex,                   \n" +
        "                       __global const int *inputPatternSize,               \n" +
        "                       __global const int *indexOffset,                    \n" +
        "                       __global const int *isInputNeuron,                  \n" +
        "                       __global const int *inputs)                         \n" +
        "{                                                                          \n" +
        "   int gid = get_global_id(0);                                             \n" +
        "   double sum = 0.0;                                                       \n" +
        //"   for(int i = 0; i < numInputs[gid+indexOffset[0]]; i++)                  \n" +
        "   for(int i = 0; i < maxNumFloats[0]; i++)                                \n" +
        "   {                                                                       \n" +
        "       if(i < numInputs[gid+indexOffset[0]])                               \n" +
        "          sum += values[inputs[(gid+indexOffset[0]) * maxNumFloats[0] + i]] * \n" +
        "               weights[(gid+indexOffset[0]) * maxNumFloats[0] + i];        \n" +
        "   }                                                                       \n" +
        "   if(usingBias[gid+indexOffset[0]])                                       \n" +
        "       sum += bias[gid+indexOffset[0]];                                    \n" +
        "   if(isInputNeuron[gid+indexOffset[0]])                                   \n" +
        "       sum += inputPatterns[gid+indexOffset[0]+(patternIndex[0] * inputPatternSize[0])];   \n" +
        "   if(activation[gid+indexOffset[0]] == 1)                                 \n" +
        "       sum = 1.0 / (1.0 + exp(-sum));                                      \n" +
        "   values[gid + indexOffset[0]] = sum;                                     \n" +
        "}                                                                          \n"
        ;
    public static final String strKernErrorCalc =
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                              \n" +
        "__kernel void Neuron(__global const double *targetPatterns,                \n" +
        "                       __global double *values,                            \n" +
        "                       __global const double *minError,                    \n" +
        "                       __global const int *indexOffset,                    \n" +
        "                       __global const int *patternIndex,                   \n" +
        "                       __global const int *targetPatternSize,              \n" +
        "                       __global double *error)                             \n" +
        "{                                                                          \n" +
        "   int gid = get_global_id(0);                                             \n" +
        "   double errorCalc =                                                      \n" +
        "       targetPatterns[(targetPatternSize[0]*patternIndex[0])+gid+indexOffset[0]]\n" +
        "       - values[gid + indexOffset[0]];                                      \n" +
        "   if(fabs(errorCalc) < minError[0])                                        \n" +
        "       error[gid + indexOffset[0]] = 0;                                    \n" +
        "   else error[gid + indexOffset[0]] = errorCalc;                           \n" +
        "}                                                                          \n"
        ;
    public static final String strKernBackProp =
        "#pragma OPENCL EXTENSION cl_khr_fp64 : enable                              \n" +
        "__kernel void Neuron(__global const int *usingBias,                        \n" +
        "                       __global const int *indexOffset,                    \n" +
        "                       __global const double *learningRate,                \n" +
        "                       __global double *error,                             \n" +
        "                       __global double *values,                            \n" +
        "                       __global const double *momentum,                    \n" +
        "                       __global double *lastBiasChange,                    \n" +
        "                       __global double *lastWeightChange,                  \n" +
        "                       __global const double *numInputs,                   \n" +
        "                       __global double *weights,                           \n" +
        "                       __global const int *maxNumFloats,                \n" +
        "                       __global const int *inputs,                         \n" +
        "                       __global double *bias)                        \n" +
        "{                                                                          \n" +
        "   int gid = get_global_id(0);                                             \n" +
        "   if(usingBias[gid + indexOffset[0]])                                     \n" +
        "   {                                                                       \n" +
        "       double biasChange = learningRate[0]                                 \n" +
        "           * error[gid + indexOffset[0]]                                   \n" +
        "           * (values[gid+indexOffset[0]] * (1.0 - values[gid+indexOffset[0]]))  \n" +
        "           + momentum[0] * lastBiasChange[gid+indexOffset[0]];             \n" +
        "       bias[gid + indexOffset[0]] += biasChange;                           \n" +
        "       lastBiasChange[gid + indexOffset[0]] = biasChange;                  \n" +
        "   }                                                                       \n" +
        "                                                                           \n" +
        "   for(int i = 0; i < numInputs[gid + indexOffset[0]]; i++)                \n" +
        "   {                                                                       \n" +
        "       double ces = error[gid+indexOffset[0]] * (values[gid+indexOffset[0]]\n" +
        "           * (1.0 - values[gid+indexOffset[0]]));                          \n" +
        "       double wes = ces                                                    \n" +
        "           * weights[((gid+indexOffset[0]) * maxNumFloats[0]) + i];        \n" +
        "                                                                           \n" +
        "       error[inputs[((gid+indexOffset[0]) * maxNumFloats[0]) + i]] += wes; \n" +
        "                                                                           \n" +
        "       double wc = learningRate[0] * ces                                   \n" +
        "           * values[inputs[((gid+indexOffset[0]) * maxNumFloats[0]) +i]] + momentum[0]\n" +
        "           * lastWeightChange[((gid+indexOffset[0]) * maxNumFloats[0]) +i];\n" +
        "       weights[((gid+indexOffset[0]) * maxNumFloats[0]) + i] += wc;        \n" +
        "       lastWeightChange[((gid+indexOffset[0]) * maxNumFloats[0]) + i] = wc;\n" +
        "   }                                                                       \n" +
        "}                                                                          \n"
        ;

    /*
     * Takes pattern set and number of loops, converts patterns to GPU
     * arrays, works out GPU tick list, sets initial weights, and trains until
     * each input pattern has been done n times. At this point, the weights and
     * error are read back.
     */
    public void GPUTrain(PatternSet pSet, int eachTimes)
    {
        setTraining(true);

        if(GPUTickList == null || GPUTickList != net.generateGPUTickList())
        {
            //Setup tick list
            GPUTickList = net.generateGPUTickList();
            int biggestLoop = 0;
            for(int i = 0; i < GPUTickList.length; i++)
                biggestLoop = Math.max(biggestLoop, GPUTickList[i]);
        }
        //Get number of input/target patterns
        int numPatterns = Math.min(pSet.inputPatterns.size, pSet.targetPatterns.size);

        if(GPUNumInputs == null || GPUNumInputs.length < net.getNeuronCount())
        {
            //Set up stuff that only depends on number of nodes
            //Number of inputs to each node
            GPUNumInputs = new int[net.getNeuronCount()];
            //Activation functions for each node
            GPUActFuncs = new int[net.getNeuronCount()];
            //Bias for each node
            GPUBias = new double[net.getNeuronCount()];
            GPUUsingBias = new int[net.getNeuronCount()];
            GPULastBiasChange = new double[net.getNeuronCount()];
            //Is input/output for each node
            GPUIsInputNeuron = new int[net.getNeuronCount()];
            GPUIsOutputNeuron = new int[net.getNeuronCount()];
        }

        //Setup 2D arrays
        //Set up input/target arrays based on number of patterns.
        GPUInputPatterns = new double[pSet.getInputPatternSize() * numPatterns];
        GPUTargetPatterns = new double[pSet.getTargetPatternSize() * numPatterns];

        GPUInputPatternSize = new int[] {pSet.getInputPatternSize()};
        GPUTargetPatternSize = new int[] {pSet.getTargetPatternSize()};
        //Populate arrays with pattern data
        for(int i = 0; i < numPatterns; i++)
        {
            java.lang.System.arraycopy(pSet.inputPatterns.get(i), 0,
                    GPUInputPatterns, i * net.getInputNeuronCount(),
                    net.getInputNeuronCount());
            java.lang.System.arraycopy(pSet.targetPatterns.get(i), 0,
                    GPUTargetPatterns, i * net.getOutputNeuronCount(),
                    net.getOutputNeuronCount());
        }

        //Set up 1D arrays for each node, and work out MaxNumFloats
        GPUMaxNumFloats[0] = 0;
        for(int i = 0; i < net.getNeuronCount(); i++)
        {
            Neuron neuron = net.getNeuron(i);
            //Set fields to vals for each neuron
            GPUNumInputs[i] = neuron.getInputLinkCount();
            GPUActFuncs[i] = neuron.getActivationFn().getClass() ==
                                Function.Sigmoid.class ? 1 : 0;
            GPUBias[i] = neuron.getBias();
            GPUUsingBias[i] = neuron.isUsingBias() ? 1 : 0;
            GPUIsInputNeuron[i] = neuron.isInputNeuron() ? 1 : 0;
            GPUIsOutputNeuron[i] = neuron.isOutputNeuron() ? 1 : 0;

            //Check for GPUMaxNumFloats
            GPUMaxNumFloats[0] = Math.max(GPUMaxNumFloats[0], GPUNumInputs[i]);
        }

        //Having worked out MaxNumFloats, work out 2D arrays
        GPUWeights = new double[net.getNeuronCount() * GPUMaxNumFloats[0]];
        GPULastWeightChange = new double[net.getNeuronCount() * GPUMaxNumFloats[0]];
        GPUInputs = new int[net.getNeuronCount() * GPUMaxNumFloats[0]];

        for(int i = 0; i < net.getNeuronCount(); i++)
        {
            Neuron neuron = net.getNeuron(i);
            for(int j = 0; j < GPUMaxNumFloats[0]; j++)
            {
                if(j < neuron.getInputLinkCount())
                {
                    Link link = neuron.getInputLink(j);
                    GPUWeights[(i * GPUMaxNumFloats[0]) + j] = link.getWeight();
                    GPUInputs[(i * GPUMaxNumFloats[0]) + j] = net.getNeuronIndex(link.getSource());
                }
                else
                {
                    GPUWeights[(i * GPUMaxNumFloats[0]) + j] = 0;
                    GPUInputs[(i * GPUMaxNumFloats[0]) + j] = 0;
                }
            }
        }

        ///////////
        //GPU STUFF

        //Setup opencl context and command queue if not already done.
        if(clCommandQueue == null || clContext == null)
            SetupCL();

        //Setup Buffers and programs
        if(memObjects == null)
        {
            memObjects = new cl_mem[25];

            //Calculate
            //Input Patterns
            memObjects[0] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_double * GPUInputPatterns.length,
                    Pointer.to(GPUInputPatterns), null);
            //Target Patterns
            memObjects[1] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_double * GPUTargetPatterns.length,
                    Pointer.to(GPUTargetPatterns), null);
            //Initial Weights
            memObjects[2] = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_double * GPUWeights.length,
                    Pointer.to(GPUWeights), null);
            //Number of Inputs
            memObjects[3] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int * GPUNumInputs.length,
                    Pointer.to(GPUNumInputs), null);
            //Activation functions
            memObjects[4] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int * GPUActFuncs.length,
                    Pointer.to(GPUActFuncs), null);
            //Bias
            memObjects[5] = clCreateBuffer(clContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_double * GPUBias.length,
                    Pointer.to(GPUBias), null);
            //Values
            memObjects[6] = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
                    Sizeof.cl_double * net.getNeuronCount(), null, null);
            //Max num floats
            memObjects[7] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int,
                    Pointer.to(GPUMaxNumFloats), null);
            //Current Pattern - Device
            memObjects[8] = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
                    Sizeof.cl_int, null, null);
            //Current Pattern - PINNED
            memObjects[9] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                    Sizeof.cl_int, null, null);
            //Input Pattern Size
            memObjects[10] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int, Pointer.to(GPUInputPatternSize), null);
            //Target Pattern Size
            memObjects[11] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int, Pointer.to(GPUTargetPatternSize), null);
            //Index Offset - Device
            memObjects[12] = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
                    Sizeof.cl_int, null, null);
            //Index Offset - PINNED
            memObjects[13] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                    Sizeof.cl_int, null, null);
            //Input Neuron?
            memObjects[14] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int * GPUIsInputNeuron.length, Pointer.to(GPUIsInputNeuron), null);
            //Output Neuron?
            memObjects[15] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int * GPUIsOutputNeuron.length, Pointer.to(GPUIsOutputNeuron), null);
            //Inputs
            memObjects[16] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int * GPUInputs.length, Pointer.to(GPUInputs), null);
            //Using bias
            memObjects[18] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_int * GPUUsingBias.length, Pointer.to(GPUUsingBias), null);

            GPUMapCurrentPattern = clEnqueueMapBuffer(clCommandQueue, memObjects[9], CL_FALSE,
                    CL_MAP_WRITE, 0, Sizeof.cl_int, 0, null, null, null);
            GPUMapIndexOffset = clEnqueueMapBuffer(clCommandQueue, memObjects[13], CL_FALSE,
                    CL_MAP_WRITE, 0, Sizeof.cl_int, 0, null, null, null);

            ///////////
            //BackProp
            //Error
            memObjects[17] = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
                    Sizeof.cl_double * GPUUsingBias.length, null, null);
            //MinError
            memObjects[19] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_double, Pointer.to(GPUMinError), null);
            //Learning Rate
            memObjects[20] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_double, Pointer.to(GPULearningRate), null);
            //Last Bias Change
            memObjects[21] = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
                    Sizeof.cl_double * GPUUsingBias.length, null, null);
            //Momentum
            memObjects[22] = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                    Sizeof.cl_double, Pointer.to(GPUMomentum), null);
            //Last Weight Change
            memObjects[23] = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
                    Sizeof.cl_double * GPUWeights.length, null, null);

        }


        if(program_iterate != null)
            clReleaseProgram(program_iterate);
        if(program_errorcalc != null)
            clReleaseProgram(program_errorcalc);
        if(program_backprop != null)
            clReleaseProgram(program_backprop);
        if(kernel_iterate != null)
            clReleaseKernel(kernel_iterate);
        if(kernel_errorcalc != null)
            clReleaseKernel(kernel_errorcalc);
        if(kernel_backprop != null)
            clReleaseKernel(kernel_backprop);

        program_iterate = clCreateProgramWithSource(clContext, 1,
                new String[] { strKernIterate }, null, null);
        clBuildProgram(program_iterate, 0, null, null, null, null);
        program_errorcalc = clCreateProgramWithSource(clContext, 1,
                new String[] { strKernErrorCalc }, null, null);
        clBuildProgram(program_errorcalc, 0, null, null, null, null);
        program_backprop = clCreateProgramWithSource(clContext, 1,
                new String[] { strKernBackProp }, null, null);
        clBuildProgram(program_backprop, 0, null, null, null, null);

        kernel_iterate = clCreateKernel(program_iterate, "Neuron", null);
        kernel_errorcalc = clCreateKernel(program_errorcalc, "Neuron", null);
        kernel_backprop = clCreateKernel(program_backprop, "Neuron", null);

        //Set kernel args
        //Input Patterns
        clSetKernelArg(kernel_iterate, 0, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[0]));
        //Target patterns
        //clSetKernelArg(kern_calc, 1, Long.valueOf(Sizeof.cl_mem),
                //Pointer.to(memObjects[1]));
        //Initial Weights
        clSetKernelArg(kernel_iterate, 1, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[2]));
        //Number of inputs
        clSetKernelArg(kernel_iterate, 2, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[3]));
        //Activation functions
        clSetKernelArg(kernel_iterate, 3, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[4]));
        //Bias
        clSetKernelArg(kernel_iterate, 4, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[5]));
        //Using Bias
        clSetKernelArg(kernel_iterate, 5, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[18]));
        //Values
        clSetKernelArg(kernel_iterate, 6, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[6]));
        //Max Num Floats
        clSetKernelArg(kernel_iterate, 7, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[7]));
        //Current Pattern
        clSetKernelArg(kernel_iterate, 8, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[8]));
        //InputPatternSize
        clSetKernelArg(kernel_iterate, 9, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[10]));
        //TargetPatternSize
        //clSetKernelArg(kern_calc, 11, Long.valueOf(Sizeof.cl_mem),
                //Pointer.to(memObjects[11]));
        //Index Offset
        clSetKernelArg(kernel_iterate, 10, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[12]));
        //Is input neuron
        clSetKernelArg(kernel_iterate, 11, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[14]));
        //Is output neuron
        //clSetKernelArg(kern_calc, 14, Long.valueOf(Sizeof.cl_mem),
                //Pointer.to(memObjects[15]));
        //Inputs
        clSetKernelArg(kernel_iterate, 12, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[16]));

        //////////
        //Error calc kernel arguments
        //Target patterns
        clSetKernelArg(kernel_errorcalc, 0, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[1]));
        //Values
        clSetKernelArg(kernel_errorcalc, 1, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[6]));
        //MinError
        clSetKernelArg(kernel_errorcalc, 2, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[19]));
        //Index Offset
        clSetKernelArg(kernel_errorcalc, 3, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[12]));
        //Pattern Index
        clSetKernelArg(kernel_errorcalc, 4, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[8]));
        //Target Pattern Size
        clSetKernelArg(kernel_errorcalc, 5, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[11]));
        //Error
        clSetKernelArg(kernel_errorcalc, 6, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[17]));

        //////////
        //BackProp
        //Using Bias
        clSetKernelArg(kernel_backprop, 0, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[18]));
        //indexOffset
        clSetKernelArg(kernel_backprop, 1, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[12]));
        //Learning Rate
        clSetKernelArg(kernel_backprop, 2, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[20]));
        //Error
        clSetKernelArg(kernel_backprop, 3, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[17]));
        //Values
        clSetKernelArg(kernel_backprop, 4, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[6]));
        //Momentum
        clSetKernelArg(kernel_backprop, 5, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[22]));
        //lastBiasChange
        clSetKernelArg(kernel_backprop, 6, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[21]));
        //lastWeightChange
        clSetKernelArg(kernel_backprop, 7, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[23]));
        //Num inputs
        clSetKernelArg(kernel_backprop, 8, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[3]));
        //Weights
        clSetKernelArg(kernel_backprop, 9, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[2]));
        //Max Num Floats
        clSetKernelArg(kernel_backprop, 10, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[7]));
        //Inputs
        clSetKernelArg(kernel_backprop, 11, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[16]));
        //Bias
        clSetKernelArg(kernel_backprop, 12, Long.valueOf(Sizeof.cl_mem),
                Pointer.to(memObjects[5]));

        clFinish(clCommandQueue);

        long global_work_offset[] = new long[]{0};
        long global_work_size[] = new long[1];
        long local_work_size[] = new long[]{1};

        int ret = 0;

        //Loop over each pattern n times
        for(int i = 0; i < eachTimes; i++)
            for(int j = 0; j < numPatterns; j++)
            {
                //Set current pattern index
                GPUMapCurrentPattern.asIntBuffer().put(0, j);
                ret = clEnqueueWriteBuffer(clCommandQueue, memObjects[8], CL_TRUE, 0,
                        Sizeof.cl_int, Pointer.to(GPUMapCurrentPattern), 0, null, null);

                //Calc
                for(int k = 0; k < GPUTickList.length; k++)
                {
                    clFlush(clCommandQueue);
                    clFinish(clCommandQueue);
                    //If input nodes
                    if(k == 0)
                        //Set index offset to 0
                        GPUMapIndexOffset.asIntBuffer().put(0, 0);
                    else
                        //Update index offset
                        GPUMapIndexOffset.asIntBuffer().put(0,
                            GPUMapIndexOffset.asIntBuffer().get(0) + GPUTickList[k-1]);
                    //Write index offset to GPU buffer
                    ret = clEnqueueWriteBuffer(clCommandQueue, memObjects[12], CL_TRUE, 0,
                            Sizeof.cl_int, Pointer.to(GPUMapIndexOffset.position(0)), 0, null, null);

                    //Set work size (width of layer)
                    global_work_size[0] = GPUTickList[k];
                    ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_iterate, 1,
                        global_work_offset, global_work_size, local_work_size,
                        0, null, null);
                    //clFinish(clCommandQueue);

                    //clEnqueueReadBuffer(clCommandQueue, memObjects[6], CL_TRUE, 0,
                    //        Sizeof.cl_double * net.getNeuronCount(),
                    //        Pointer.to(GPUBias), 0, null, null);
                }
                //Error calc
                clFlush(clCommandQueue);
                clFinish(clCommandQueue);
                ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_errorcalc, 1,
                    global_work_offset, global_work_size, local_work_size,
                    0, null, null);

                //Back prop
                for(int k = GPUTickList.length -1; k >= 0; k--)
                {
                    global_work_size[0] = GPUTickList[k];
                    //Do backprop
                    clFlush(clCommandQueue);
                    clFinish(clCommandQueue);
                    ret = clEnqueueNDRangeKernel(clCommandQueue, kernel_backprop, 1,
                            global_work_offset, global_work_size, local_work_size,
                            0, null, null);
                    //If there is another level to go, update offset
                    if(k > 0)
                        GPUMapIndexOffset.asIntBuffer().put(0,
                            GPUMapIndexOffset.asIntBuffer().get(0) - GPUTickList[k-1]);
                }

            }

        //Read Weights
        clFlush(clCommandQueue);
        clFinish(clCommandQueue);
        ret = clEnqueueReadBuffer(clCommandQueue, memObjects[2], CL_TRUE, 0,
                Sizeof.cl_double * GPUWeights.length, Pointer.to(GPUWeights),
                0, null, null);
        //Read Bias
        ret = clEnqueueReadBuffer(clCommandQueue, memObjects[5], CL_TRUE, 0,
                Sizeof.cl_double * GPUBias.length, Pointer.to(GPUBias),
                0, null, null);

        for(int i = 0; i < net.getNeuronCount(); i++)
        {
            Neuron neuron = net.getNeuron(i);
            neuron.setBias(GPUBias[i]);
            for(int j = 0; j < neuron.getInputLinkCount(); j++)
            {
                Link link = neuron.getInputLink(j);
                link.setWeight(GPUWeights[(i * GPUMaxNumFloats[0]) + j]);
            }
        }
        setTraining(false);
    }

    protected int SetupCL()
    {
        try
        {
            final int platformIndex = 0;
            final long deviceType = CL_DEVICE_TYPE_GPU;
            final int deviceIndex = 0;

            CL.setExceptionsEnabled(true);

            //Obtain number of platforms
            int numPlatformsArray[] = new int[1];
            clGetPlatformIDs(0, null, numPlatformsArray);
            int numPlatforms = numPlatformsArray[0];

            //Obtain a platform ID
            cl_platform_id platforms[] = new cl_platform_id[numPlatforms];
            clGetPlatformIDs(platforms.length, platforms, null);
            cl_platform_id platform = platforms[platformIndex];

            //Initialize the context properties
            cl_context_properties contextProperties = new cl_context_properties();
            contextProperties.addProperty(CL_CONTEXT_PLATFORM, platform);

            //Obtain the number of devices for the platform
            int numDevicesArray[] = new int[1];
            clGetDeviceIDs(platform, deviceType, 0, null, numDevicesArray);
            int numDevices = numDevicesArray[0];

            //Obtain a device ID
            cl_device_id devices[] = new cl_device_id[numDevices];
            clGetDeviceIDs(platform, deviceType, numDevices, devices, null);
            cl_device_id device = devices[deviceIndex];

            //Create a context for the selected device
            clContext = clCreateContext(contextProperties, 1, new cl_device_id[]{device},
                    null, null, null);

            //Create a command-queue for the selected device
            clCommandQueue = clCreateCommandQueue(clContext, device, 0, null);
        } catch (CLException e) {

        }
        return 0;
    }
    //
    // creation & persistence

    /**
     *  Create a new Backprop trainer.
     *  Uses a {@link boone.TrainingSignalGenerator.SquareError} .
     */
    public
    BackpropTrainer()
    {
        setTrainingSignalGenerator(new TrainingSignalGenerator.SquareError());
    }

    /** create standard NeuronTrainers, LinkTrainers. */
    public
    PartTrainer createPartTrainer(BrainPart part)
    {
        if(part instanceof Neuron)
            return new BackpropNeuronTrainer((Neuron) part, this);
        if(part instanceof Link)
            return new BackpropLinkTrainer((Link) part, this);
        return null;
    }


    //
    // training

    /**
     *  supervised training. an input and an output pattern are given. <p>
     *
     *  Backpropagation Training: <ul>
     *      <li> set the training flag
     *      <li> test the network
     *      <li> calculate the error of the output neurons, train them
     *      <li> propagate backwards through the network, using the reversed tickList.
     *      <li> unset the training flag.
     *      <li> return the error. </ul>
     *
     *  This method uses a {@link boone.TrainingSignalGenerator} to calculate the error signals.
     *  Please see {@link boone.TrainingSignalGenerator.SquareError} for details on what the
     *  TrainingSignalGenerator instance is supposed to do.
     */
    public
    void trainTurn(double[] input, double[] target)
    {
        setTraining(true);

        // initialize all the neurons
        for(int i=net.getNeuronCount()-1; i>=0; i--)
            ((BackpropNeuronTrainer) net.getNeuron(i).getPartTrainer()).beginTurn();

        // test the network, calculating the error.
        net.setInput(input);
        net.innervate();

        // calculate neuron error, error sum for output neurons
        trainingSignalGenerator.calculateSignal(net, input, target, minError);

        // train: propagate backwards
        VarArray<Neuron> tlist = net.getTickList();
        for(int i=tlist.size-1; i>=0; i--)
        {
            PartTrainer partTrainer = ((Neuron) tlist.array[i]).partTrainer;
            if(partTrainer != null)
                partTrainer.train();
        }

        setTraining(false);
    }


    //
    // getters & setters


    /** @return Value of property minError. */
    public double getMinError() { return minError; }

    /** @param minError New value of property minError. */
    public void setMinError(double minError) { this.minError = minError; }

    /** @return Value of property momentum. */
    public double getMomentum() { return momentum; }

    /** @param momentum New value of property momentum. */
    public void setMomentum(double momentum) { this.momentum = momentum; }


    //
    // storing, loading

    /** Store the state into the IOElement. */
    public
    void store(IOElement node)
    {
        super.store(node);
        node.putAttribute("minError", minError);
        node.putAttribute("momentum", momentum);
    }

    /** Load the state from the IOElement. */
    public
    void load(IOElement node) throws IOElement.LoadException
    {
        super.load(node);
        minError = node.getDoubleAttribute("minError", minError);
        momentum = node.getDoubleAttribute("momentum", momentum);
    }

    //
    //
    // subclasses
    //

    /**
     * PartTrainer for Neurons
     * @author August Mayer
     */
    public static
    class BackpropNeuronTrainer
    extends PartTrainer
    {

        /** neuron error term */
        protected double errorSignal = 0.0;

        /** last bias weight change */
        protected double lastBiasChange = 0;

        //
        // construction, persistence

        /** create a new null trainer. Needed by persistence. */
        public BackpropNeuronTrainer() {}

        /** new NeuronTrainer. */
        public BackpropNeuronTrainer(Neuron neuron, BackpropTrainer trainer) { super(neuron, trainer); }


        //
        // Training

        /** reset lastBiasChange to 0 */
        public void resetTraining() { lastBiasChange = 0.0; }

        /** reset the error signal each turn */
        public void beginTurn() { errorSignal = 0.0; }

        /**
         *  train the neuron. <br>
         *  This is called in Neuron.run() as well as in the Trainer.
         */
        public
        void train()
        {
            Neuron neuron = (Neuron) part;

            // train the bias (if the neuron uses bias)
            if(neuron.isUsingBias())
            {
                BackpropTrainer bptrainer = (BackpropTrainer) trainer;
                double biasChange = bptrainer.learnRate
                    * errorSignal
                    * neuron.getActivationFn().mapDerivative(neuron.getInput())
                    + bptrainer.momentum * lastBiasChange;
                neuron.addToBias(biasChange);
                lastBiasChange = biasChange;
            }

            // train the preceding links
            for(int i=neuron.getInputLinkCount()-1; i>=0; i--)
            {
                PartTrainer partTrainer = neuron.getInputLink(i).partTrainer;
                if(partTrainer != null)
                    partTrainer.train();
            }
        }


        //
        // getters & setters

        /** return the current errorSignal */
        public double getErrorSignal() { return errorSignal; }

        /** set the error signal */
        public void setErrorSignal(double errorSignal) { this.errorSignal = errorSignal; }

        /** return the last bias change. */
        public double getLastBiasChange() { return lastBiasChange; }

        /**  set the last bias change */
        public void setLastBiasChange(double changeVal) { this.lastBiasChange = changeVal; }

        /** Store the state. */
        public
        void store(IOElement node)
        {
            node.putAttribute("errorSignal", errorSignal);
            node.putAttribute("lastBiasChange", lastBiasChange);
        }

        /** Load the state. */
        public
        void load(IOElement node)
        {
            errorSignal = node.getDoubleAttribute("errorSignal", errorSignal);
            lastBiasChange = node.getDoubleAttribute("lastBiasChange", lastBiasChange);
        }
    }


    /**
     *  PartTrainer for Links
     *  @author August Mayer
     */
    public static
    class BackpropLinkTrainer extends PartTrainer
    {

        /** last link weight change */
        protected double lastWeightChange = 0;

        /** create a new null object. Needed by persistence. */
        public BackpropLinkTrainer() {}

        /** Creates a new instance of BPLinkTrainer */
        public BackpropLinkTrainer(Link link, BackpropTrainer trainer) { super(link, trainer); }

        /** reset the lastWeightChange to 0 */
        public void resetTraining() { lastWeightChange = 0; }

        /**
         *  Train the link. <br>
         *  This is called in Neuron.run() as well as in the NetTrainer.
         */
        public
        void train()
        {
            Link link = (Link) part;
            BackpropTrainer bptrainer = (BackpropTrainer) trainer;
            Neuron src = link.getSource();
            BackpropNeuronTrainer srcTrainer = (BackpropNeuronTrainer) src.partTrainer;
            Neuron sink = link.getSink();
            BackpropNeuronTrainer sinkTrainer = (BackpropNeuronTrainer) sink.partTrainer;

            // get the sink error
            double completeErrorSignal = sinkTrainer.getErrorSignal()
                * sink.getActivationFn().mapDerivative(sink.getInput());

            double weightedErrorSignal = completeErrorSignal * link.getWeight();

            // propagate error to source neuron:
            srcTrainer.errorSignal += weightedErrorSignal;

            // modify the link weight
            double weightChange = bptrainer.learnRate
                * completeErrorSignal
                * src.getOutput()
                + bptrainer.momentum * lastWeightChange;
            link.addToWeight(weightChange);

            lastWeightChange = weightChange;
        }

        /** return the last link weight change. */
        public double getLastWeightChange() { return lastWeightChange; }

        /**  set the last link weight change */
        public void setLastWeightChange(double changeVal) { this.lastWeightChange = changeVal; }

        /** Store the state. */
        public
        void store(IOElement node)
        {
            node.putAttribute("lastWeightChange", lastWeightChange);
        }

        /** Load the state. */
        public
        void load(IOElement node)
        {
            lastWeightChange = node.getDoubleAttribute("lastWeightChange", lastWeightChange);
        }

    } // end LinkTrainer

} // end BackpropTrainer