Untitled

$ cat t3.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define DSIZE 1000
#define TOL 0.01f
#define DAT 2.0f
/*
 * Self-checking sum-of-squares test.
 *
 * Fills a DSIZE x DSIZE row-major matrix with DAT, computes the sum of the
 * squared elements of every row once with a plain host loop and once inside
 * an OpenACC data/kernels region, then compares the two result vectors
 * element by element against TOL.
 *
 * Returns 0 and prints "Success" when every row agrees; returns 1 on an
 * allocation failure or on the first mismatching row.
 */
int main(){

  float temp;
  int i_v, i_el;
  /* The sizes must be set before the fill loop below reads them as bounds. */
  int ntv = DSIZE;      /* number of rows                */
  int len_tv = DSIZE;   /* number of elements in one row */
  int status = 0;
  float *tva, *tv_sq, *tv_sq_cpu;

  /* Size arithmetic is done in size_t so it cannot overflow int. */
  tva       = malloc((size_t)ntv * (size_t)len_tv * sizeof *tva);
  tv_sq     = malloc((size_t)ntv * sizeof *tv_sq);
  tv_sq_cpu = malloc((size_t)ntv * sizeof *tv_sq_cpu);
  if ((tva == NULL) || (tv_sq == NULL) || (tv_sq_cpu == NULL)) {
    printf("malloc fail\n");
    /* free(NULL) is a no-op, so releasing all three is always safe. */
    free(tva);
    free(tv_sq);
    free(tv_sq_cpu);
    return 1;
  }

  /* Every element gets the same value, so each row sum is known exactly. */
  for (i_v = 0; i_v < ntv; i_v++)
    for (i_el = 0; i_el < len_tv; i_el++)
      tva[(i_v*len_tv) + i_el] = DAT;

  /* Reference result computed sequentially on the host. */
  for( i_v = 0; i_v < ntv; i_v++ )
  {
    temp = 0;

    for( i_el = 0; i_el < len_tv; i_el++ )
        temp += powf( tva[i_v*len_tv + i_el], 2.0f );

    tv_sq_cpu[i_v]=temp;
  }

  /*
   * Same computation expressed with OpenACC directives: the input matrix is
   * copied in, the per-row sums are copied out, and the inner loop is a sum
   * reduction into temp.
   */
  #pragma acc data copyin(tva[:(len_tv*ntv)]) copyout(tv_sq[:ntv]) create(temp)
  {
    #pragma acc kernels loop independent
        for( i_v = 0; i_v < ntv; i_v++ )
        {
            temp = 0;

            #pragma acc loop independent gang vector reduction(+:temp)
                for( i_el = 0; i_el < len_tv; i_el++ )
                    temp += powf( tva[i_v*len_tv + i_el], 2.0f );

            tv_sq[i_v]=temp;
        }
  }

  /*
   * fabsf, not abs: the integer abs() would truncate the float difference,
   * hiding any discrepancy smaller than 1.0 and defeating TOL.
   */
  for (i_v = 0; i_v < ntv; i_v++)
    if (fabsf(tv_sq[i_v] - tv_sq_cpu[i_v]) > TOL) {
      printf("mismatch at idx: %d cpu: %f gpu: %f\n", i_v, tv_sq_cpu[i_v], tv_sq[i_v]);
      status = 1;
      break;   /* report only the first mismatch */
    }

  if (status == 0)
    printf("Success\n");

  free(tva);
  free(tv_sq);
  free(tv_sq_cpu);
  return status;

}

$ pgcc -O3 -acc -ta=nvidia,cc20,cuda5.0 -Minfo=accel t3.c -o t3
main:
     32, Generating create(temp)
         Generating copyout(tv_sq[0:ntv])
         Generating copyin(tva[0:ntv*len_tv])
     34, Generating present_or_copyout(tv_sq[0:ntv])
         Generating present_or_copyin(tva[0:ntv*len_tv])
         Generating NVIDIA code
         Generating compute capability 2.0 binary
     35, Loop is parallelizable
         Accelerator kernel generated
         35, #pragma acc loop gang /* blockIdx.x */
         40, #pragma acc loop vector(128) /* threadIdx.x */
         Loop is parallelizable
$ ./t3
Success
$ pgcc -V

pgcc 13.10-0 64-bit target on x86-64 Linux -tp nehalem
The Portland Group - PGI Compilers and Tools
Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved.
$