Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /******************************************************************************
- *
- * Copyright (C) 2009 - 2014 Xilinx, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * Use of the Software is limited solely to applications:
- * (a) running on a Xilinx device, or
- * (b) that interact with a Xilinx device through a bus or interconnect.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * XILINX BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
- * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Except as contained in this notice, the name of the Xilinx shall not be used
- * in advertising or otherwise to promote the sale, use or other dealings in
- * this Software without prior written authorization from Xilinx.
- *
- ******************************************************************************/
- /*
- * helloworld.c: simple test application
- *
- * This application configures UART 16550 to baud rate 9600.
- * PS7 UART (Zynq) is not initialized by this application, since
- * bootrom/bsp configures it to baud rate 115200
- *
- * ------------------------------------------------
- * | UART TYPE BAUD RATE |
- * ------------------------------------------------
- * uartns550 9600
- * uartlite Configurable only in HW design
- * ps7_uart 115200 (configured by bootrom/bsp)
- */
- #include <stdio.h>
- #include <stdlib.h>
- #include "platform.h"
- #include "xil_printf.h"
- #include "xparameters.h"
- #include "xuartlite.h"
- #include "hw_spec.h"
/* Control-interface base address of each VTA module (XPAR_* names come from the BSP headers). */
#define VTA_FETCH_ADDR    XPAR_VTA_FETCH_0_S_AXI_CONTROL_BASEADDR
#define VTA_LOAD_ADDR     XPAR_VTA_LOAD_0_S_AXI_CONTROL_BASEADDR
#define VTA_COMPUTE_ADDR  XPAR_VTA_COMPUTE_0_S_AXI_CONTROL_BASEADDR
#define VTA_STORE_ADDR    XPAR_VTA_STORE_0_S_AXI_CONTROL_BASEADDR

/*! \brief Value written to a VTA configuration register to start a module */
#define VTA_START        0x1
/*! \brief Value written to a VTA configuration register to start with auto-restart */
#define VTA_AUTORESTART  0x81
/*! \brief Bit tested in the VTA done register to detect completion */
#define VTA_DONE         0x1

/* File-local boolean type: false == 0, true == 1. */
typedef enum { false, true } bool;

/* Element types used by the test: micro-op word, weight, input, output, accumulator. */
typedef uint32_t uop_T;
typedef int8_t   wgt_T;
typedef int8_t   inp_T;
typedef int8_t   out_T;
typedef int32_t  acc_T;
/*
 * Turn a 32-bit register base address into a pointer that can be handed to
 * VTAWriteMappedReg() / VTAReadMappedReg().
 *
 * addr: numeric base address of the register block.
 * Returns addr reinterpreted as a pointer; nothing is allocated or mapped.
 */
void *VTAMapRegister(uint32_t addr) {
    // Convert through uintptr_t so the integer-to-pointer cast is well-formed
    // even when pointers are wider than 32 bits.
    return (void *)(uintptr_t)addr;
}
/*
 * Store a 32-bit value into a register.
 *
 * p:      base pointer of the register block.
 * offset: byte offset of the register from p.
 * data:   value to store.
 */
void VTAWriteMappedReg(void *p, uint32_t offset, uint32_t data) {
    // volatile: the store targets a device register, so the compiler must
    // emit it exactly once and must not reorder it against other register
    // accesses.
    volatile uint32_t *reg = (volatile uint32_t *)((uint8_t *)p + offset);
    *reg = data;
}
/*
 * Load a 32-bit value from a register.
 *
 * base_addr: base pointer of the register block.
 * offset:    byte offset of the register from base_addr.
 * Returns the value currently held at base_addr + offset.
 */
uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
    // The offset must be applied (it used to be ignored, so every read hit
    // the register at offset 0). volatile forces a fresh load on every call,
    // which polling loops rely on.
    volatile uint32_t *reg = (volatile uint32_t *)((uint8_t *)base_addr + offset);
    return *reg;
}
- void vta(uint32_t insn_count, VTAGenericInsn *insns, VTAUop *uops, uint32_t *inputs, uint32_t *weights, uint32_t *biases, uint32_t *outputs) {
- // Get VTA handles
- void* vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR);
- void* vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR);
- void* vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR);
- void* vta_store_handle = VTAMapRegister(VTA_STORE_ADDR);
- VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_COUNT_OFFSET, insn_count);
- if (insns) VTAWriteMappedReg(vta_fetch_handle, VTA_FETCH_INSN_ADDR_OFFSET, insns);
- if (inputs) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_INP_ADDR_OFFSET, inputs);
- if (weights) VTAWriteMappedReg(vta_load_handle, VTA_LOAD_WGT_ADDR_OFFSET, weights);
- if (uops) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_UOP_ADDR_OFFSET, uops);
- if (biases) VTAWriteMappedReg(vta_compute_handle, VTA_COMPUTE_BIAS_ADDR_OFFSET, biases);
- if (outputs) VTAWriteMappedReg(vta_store_handle, VTA_STORE_OUT_ADDR_OFFSET, outputs);
- // VTA start
- VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
- VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
- VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
- VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
- int flag = 0, t = 0;
- for (t = 0; t < 10000000; ++t) {
- flag = VTAReadMappedReg(vta_compute_handle, VTA_COMPUTE_DONE_RD_OFFSET);
- if (flag & VTA_DONE) break;
- }
- if (t == 10000000) {
- xil_printf("\tWARNING: VTA TIMEOUT!!!!\n");
- } else {
- xil_printf("INFO - FPGA Finished!\n");
- }
- }
- const char* getOpcodeString(int opcode, bool use_imm) {
- // Returns string name
- if (opcode == VTA_ALU_OPCODE_MIN) {
- if (use_imm) {
- return "min imm";
- } else {
- return "min";
- }
- } else if (opcode == VTA_ALU_OPCODE_MAX) {
- if (use_imm) {
- return "max imm";
- } else {
- return "max";
- }
- } else if (opcode == VTA_ALU_OPCODE_ADD) {
- if (use_imm) {
- return "add imm";
- } else {
- return "add";
- }
- } else if (opcode == VTA_ALU_OPCODE_SHR) {
- return "shr";
- }
- // else if (opcode == VTA_ALU_OPCODE_MUL) {
- // return "mul";
- // }
- return "unknown op";
- }
// If some header pulled in <assert.h>, drop its macro so that this file's own
// implementation below is the one being defined and called.
#undef assert

/*
 * Minimal assertion for the bare-metal target: prints a message and halts
 * when the condition is false.
 *
 * o: condition value; zero means failure. Taken as int so that a non-zero
 *    value such as 0x100 is not truncated to 0 the way a uint8_t parameter
 *    would truncate it.
 */
void assert(int o) {
    if (!o) {
        xil_printf("ASSERT FAILED!!\r\n");
        // Halt here so the failure is not followed by further execution
        while (1);
    }
}
/*
 * Allocate a buffer of num_bytes bytes with malloc().
 *
 * Returns whatever malloc() returns (NULL on failure); the caller owns the
 * memory and releases it with free().
 */
void * allocBuffer(size_t num_bytes) {
    void *buffer = malloc(num_bytes);
    return buffer;
}
- VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
- int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
- // Converter
- union VTAInsn converter;
- // Memory instruction initialization
- VTAMemInsn insn = {};
- insn.opcode = opcode;
- insn.pop_prev_dep = pop_prev_dep;
- insn.pop_next_dep = pop_next_dep;
- insn.push_prev_dep = push_prev_dep;
- insn.push_next_dep = push_next_dep;
- insn.memory_type = type;
- insn.sram_base = sram_offset;
- insn.dram_base = dram_offset;
- insn.y_size = 1;
- insn.x_size = size;
- insn.x_stride = size;
- insn.y_pad_0 = 0;
- insn.y_pad_1 = 0;
- insn.x_pad_0 = 0;
- insn.x_pad_1 = 0;
- converter.mem = insn;
- return converter.generic;
- }
- VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
- int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
- int push_prev_dep, int push_next_dep) {
- // Converter
- union VTAInsn converter;
- // Memory instruction initialization
- VTAMemInsn insn = {};
- insn.opcode = opcode;
- insn.pop_prev_dep = pop_prev_dep;
- insn.pop_next_dep = pop_next_dep;
- insn.push_prev_dep = push_prev_dep;
- insn.push_next_dep = push_next_dep;
- insn.memory_type = type;
- insn.sram_base = sram_offset;
- insn.dram_base = dram_offset;
- insn.y_size = y_size;
- insn.x_size = x_size;
- insn.x_stride = x_stride;
- insn.y_pad_0 = y_pad;
- insn.y_pad_1 = y_pad;
- insn.x_pad_0 = x_pad;
- insn.x_pad_1 = x_pad;
- converter.mem = insn;
- return converter.generic;
- }
- VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bool uop_compression,
- int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep) {
- // Converter
- union VTAInsn converter;
- // Memory instruction initialization
- VTAAluInsn insn = {};
- insn.opcode = VTA_OPCODE_ALU;
- insn.pop_prev_dep = pop_prev_dep;
- insn.pop_next_dep = pop_next_dep;
- insn.push_prev_dep = push_prev_dep;
- insn.push_next_dep = push_next_dep;
- insn.reset_reg = false;
- if (!uop_compression) {
- insn.uop_bgn = 0;
- insn.uop_end = vector_size;
- insn.iter_out = 1;
- insn.iter_in = 1;
- insn.dst_factor_out = 0;
- insn.src_factor_out = 0;
- insn.dst_factor_in = 0;
- insn.src_factor_in = 0;
- insn.alu_opcode = opcode;
- insn.use_imm = use_imm;
- insn.imm = imm;
- } else {
- insn.uop_bgn = 0;
- insn.uop_end = 1;
- insn.iter_out = 1;
- insn.iter_in = vector_size;
- insn.dst_factor_out = 0;
- insn.src_factor_out = 0;
- insn.dst_factor_in = 1;
- insn.src_factor_in = 1;
- insn.alu_opcode = opcode;
- insn.use_imm = use_imm;
- insn.imm = imm;
- }
- converter.alu = insn;
- return converter.generic;
- }
- VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
- // Converter
- union VTAInsn converter;
- // GEMM instruction initialization
- VTAGemInsn insn;
- insn.opcode = VTA_OPCODE_FINISH;
- insn.pop_prev_dep = pop_prev;
- insn.pop_next_dep = pop_next;
- insn.push_prev_dep = 0;
- insn.push_next_dep = 0;
- insn.reset_reg = false;
- insn.uop_bgn = 0;
- insn.uop_end = 0;
- insn.iter_out = 0;
- insn.iter_in = 0;
- insn.dst_factor_out = 0;
- insn.src_factor_out = 0;
- insn.wgt_factor_out = 0;
- insn.dst_factor_in = 0;
- insn.src_factor_in = 0;
- insn.wgt_factor_in = 0;
- converter.gemm = insn;
- return converter.generic;
- }
- VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
- // Derive the total uop size
- int uop_size = (uop_compression) ? 1 : vector_size;
- // Allocate buffer
- VTAUop *uop_buf = (VTAUop *)(malloc(sizeof(VTAUop) * uop_size));
- if (!uop_compression) {
- for (int i = 0; i < vector_size; i++) {
- uop_buf[i].dst_idx = i;
- uop_buf[i].src_idx = vector_size + i;
- }
- } else {
- uop_buf[0].dst_idx = 0;
- uop_buf[0].src_idx = vector_size;
- }
- return uop_buf;
- }
/* Seed state passed to rand_r() by the test data generators; starts at zero. */
uint32_t globalSeed = 0;
- acc_T** alloc2dArray_accT(int rows, int cols) {
- acc_T**array = (acc_T **)(malloc(sizeof(acc_T *) * rows));
- for (int i = 0; i < rows; i++) {
- array[i] = (acc_T *)(malloc(sizeof(acc_T) * cols));
- }
- return array;
- }
- out_T** alloc2dArray_outT(int rows, int cols) {
- out_T**array = (out_T **)(malloc(sizeof(out_T *) * rows));
- for (int i = 0; i < rows; i++) {
- array[i] = (out_T *)(malloc(sizeof(out_T) * cols));
- }
- return array;
- }
- void packBuffer_uint32_32_accT_VTAACCWIDTH(uint32_t *dst, acc_T **src, int y_size, int x_size, int y_block, int x_block) {
- assert((VTA_ACC_WIDTH * x_block * y_block) % 32 == 0);
- assert(32 <= 64);
- int buffer_idx = 0;
- int ratio = 32 / VTA_ACC_WIDTH;
- long long int mask = (1ULL << VTA_ACC_WIDTH) - 1;
- uint32_t tmp = 0;
- for (int i = 0; i < y_size / y_block; i++) {
- for (int j = 0; j < x_size / x_block; j++) {
- for (int k = 0; k < y_block; k++) {
- for (int l = 0; l < x_block; l++) {
- int block_idx = l + k * x_block;
- tmp |= (src[i * y_block + k][j * x_block + l] & mask) << ((block_idx % ratio) * VTA_ACC_WIDTH);
- // When tmp is packed, write to destination array
- if (block_idx % ratio == ratio - 1) {
- dst[buffer_idx++] = tmp;
- tmp = 0;
- }
- }
- }
- }
- }
- }
- void unpackBuffer(out_T **dst, uint32_t *src, int y_size, int x_size, int y_block, int x_block) {
- assert((VTA_OUT_WIDTH * x_block * y_block) % 32 == 0);
- int buffer_idx = 0;
- long long int mask = (1ULL << VTA_OUT_WIDTH) - 1;
- int ratio = 32 / VTA_OUT_WIDTH;
- for (int i = 0; i < y_size / y_block; i++) {
- for (int j = 0; j < x_size / x_block; j++) {
- for (int k = 0; k < y_block; k++) {
- for (int l = 0; l < x_block; l++) {
- int block_idx = l + k * x_block;
- dst[i * y_block + k][j * x_block + l] = (src[buffer_idx] >> ((block_idx % ratio) * VTA_OUT_WIDTH)) & mask;
- if (block_idx % ratio == ratio - 1) {
- buffer_idx++;
- }
- }
- }
- }
- }
- }
- int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
- // Some assertions
- assert(batch % VTA_BATCH == 0);
- assert(vector_size % VTA_BLOCK_OUT == 0);
- printf("=====================================================================================\n");
- printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n", getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
- // Instruction count
- int ins_size = 3 * batch / VTA_BATCH + 2;
- // Micro op count
- int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
- // Input/output elements in each transfer
- int tx_size = vector_size / VTA_BLOCK_OUT;
- // Number of input sets to be generated
- int input_sets = (use_imm) ? 1 : 2;
- // Make sure we don't exceed buffer bounds
- assert(uop_size <= VTA_UOP_BUFF_DEPTH);
- assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);
- // Immediate values
- acc_T *immediate = (acc_T *)(malloc(sizeof(acc_T) * batch / VTA_BATCH));
- for (int b = 0; b < batch / VTA_BATCH; b++) {
- if (opcode == VTA_ALU_OPCODE_MIN) {
- immediate[b] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
- } else if (opcode == VTA_ALU_OPCODE_MAX) {
- immediate[b] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
- } else if (opcode == VTA_ALU_OPCODE_ADD) {
- immediate[b] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 1)) - (1LL << (VTA_ALUOP_IMM_BIT_WIDTH - 2)));
- } else if (opcode == VTA_ALU_OPCODE_SHR) {
- immediate[b] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
- }
- }
- // Initialize instructions
- VTAGenericInsn *insn_buf = (VTAGenericInsn *)(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
- int insn_idx = 0;
- insn_buf[insn_idx++] = get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
- for (int b = 0; b < batch; b += VTA_BATCH) {
- insn_buf[insn_idx++] = get2DLoadStoreInsn(
- VTA_OPCODE_LOAD, // opcode
- VTA_MEM_ID_ACC, // vector size
- 0, // sram offset
- b / VTA_BATCH * tx_size * input_sets, // dram offset
- 1, // y size
- tx_size * input_sets, // x size
- tx_size * input_sets, // x stride
- 0, // y pad
- 0, // x pad
- 0, // pop prev dep
- b > 0, // pop next dep
- 0, // push prev dep
- 0); // push next dep
- insn_buf[insn_idx++] = getALUInsn(
- opcode, // opcode
- tx_size, // vector size
- use_imm, // use imm
- immediate[b / VTA_BATCH], // imm
- uop_compression, // uop compression
- 0, // pop prev dep
- 0, // pop next dep
- 0, // push prev dep
- 1); // push next dep
- insn_buf[insn_idx++] = get2DLoadStoreInsn(
- VTA_OPCODE_STORE, // opcode
- VTA_MEM_ID_OUT, // vector size
- 0, // sram offset
- b / VTA_BATCH * tx_size, // dram offset
- 1, // y size
- tx_size, // x size
- tx_size, // x stride
- 0, // y pad
- 0, // x pad
- 1, // pop prev dep
- 0, // pop next dep
- 1, // push prev dep
- 0); // push next dep
- }
- // Finish
- insn_buf[insn_idx++] = getFinishInsn(0, 1);
- // Prepare the uop buffer
- VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);
- // Initialize the input/output data
- acc_T **inputs = alloc2dArray_accT(batch, vector_size * input_sets);
- for (int i = 0; i < batch; i++) {
- for (int j = 0; j < vector_size * input_sets; j++) {
- if (opcode == VTA_ALU_OPCODE_MIN) {
- inputs[i][j] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
- } else if (opcode == VTA_ALU_OPCODE_MAX) {
- inputs[i][j] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
- } else if (opcode == VTA_ALU_OPCODE_ADD) {
- inputs[i][j] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 2)) - (1LL << (VTA_INP_WIDTH - 3)));
- } else if (opcode == VTA_ALU_OPCODE_SHR) {
- inputs[i][j] = (acc_T)(rand_r(&globalSeed) % (1LL << (VTA_SHR_ARG_BIT_WIDTH - 1)) - (1LL << (VTA_SHR_ARG_BIT_WIDTH - 2)));
- }
- }
- }
- // Compute reference output
- out_T **outputs_ref = alloc2dArray_outT(batch, vector_size);
- for (int i = 0; i < batch; i++) {
- for (int j = 0; j < vector_size; j++) {
- acc_T out_val = 0;
- acc_T imm_val = immediate[i / VTA_BATCH];
- acc_T src_val = inputs[i][j + vector_size];
- if (opcode == VTA_ALU_OPCODE_MIN) {
- if (!use_imm) {
- out_val = inputs[i][j] < src_val ? inputs[i][j] : src_val;
- } else {
- out_val = inputs[i][j] < imm_val ? inputs[i][j] : imm_val;
- }
- } else if (opcode == VTA_ALU_OPCODE_MAX) {
- if (!use_imm) {
- out_val = inputs[i][j] > src_val ? inputs[i][j] : src_val;
- } else {
- out_val = inputs[i][j] > imm_val ? inputs[i][j] : imm_val;
- }
- } else if (opcode == VTA_ALU_OPCODE_ADD) {
- if (!use_imm) {
- out_val = inputs[i][j] + src_val;
- } else {
- out_val = inputs[i][j] + imm_val;
- }
- } else if (opcode == VTA_ALU_OPCODE_SHR) {
- if (!use_imm) {
- if (src_val >= 0) {
- out_val = inputs[i][j] >> src_val;
- } else {
- out_val = inputs[i][j] << (0 - src_val);
- }
- } else {
- if (imm_val >= 0) {
- out_val = inputs[i][j] >> imm_val;
- } else {
- out_val = inputs[i][j] << (0 - imm_val);
- }
- }
- }
- outputs_ref[i][j] = (out_T) out_val;
- }
- }
- // Pack input buffer
- uint32_t *bias_buf = (uint32_t *)(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
- packBuffer_uint32_32_accT_VTAACCWIDTH(bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
- // Prepare output buffer
- uint32_t *output_buf = (uint32_t *)(allocBuffer(VTA_OUT_ELEM_BYTES * batch * tx_size * input_sets));
- // Invoke the VTA
- vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
- // Unpack output buffer
- out_T **outputs = alloc2dArray_outT(batch, vector_size);
- unpackBuffer(outputs, output_buf, batch, vector_size, VTA_BATCH, VTA_BLOCK_OUT);
- // Correctness checks
- int err = 0;
- for (int i = 0; i < batch; i++) {
- for (int j = 0; j < vector_size; j++) {
- if (outputs_ref[i][j] != outputs[i][j]) {
- err++;
- printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, (int)(outputs_ref[i][j]), (int)(outputs[i][j]));
- }
- }
- }
- // Free all allocated arrays
- free(immediate);
- if (err == 0) {
- printf("INFO - ALU test successful!\n");
- return 0;
- } else {
- printf("INFO - ALU test failed, got %d errors!\n", err);
- return -1;
- }
- }
- int main()
- {
- XUartLite UartLite;
- init_platform();
- print("Hello World\n\r");
- XUartLite_Initialize(&UartLite, XPAR_AXI_UARTLITE_0_DEVICE_ID);
- alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
- print("Successfully ran Hello World application");
- cleanup_platform();
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement