Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- /*
- * Copyright (c) 2016 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- *
- * A stand alone test bed for abusing the Gen graphics command
- * streamer ALU
- */
- #include <stdint.h>
- #include <stdbool.h>
- #include <stdio.h>
- #include <assert.h>
- #include <math.h>
- #include <inttypes.h>
/* Generic helpers.
 *
 * NOTE: max()/min() evaluate their arguments more than once, so never pass
 * expressions with side effects.
 */
#define max(A, B) ( (A) > (B) ? (A) : (B) )
#define min(A, B) ( (A) < (B) ? (A) : (B) )
#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))

/* Minimal stand-ins for the driver's batch buffer macros: BEGIN_BATCH and
 * ADVANCE_BATCH do nothing, OUT_BATCH appends one dword to the file-level
 * batch[] array (asserting that there is still room for it).
 */
#define BEGIN_BATCH(X) do { (void)(X); } while (0)
#define ADVANCE_BATCH(X)
#define OUT_BATCH(X) do { \
      assert(batch_ptr >= 0 && (size_t)batch_ptr < ARRAY_SIZE(batch)); \
      batch[batch_ptr++] = (X); \
   } while (0)

/* Command headers: command type in the top bits, MI opcode at bit 23 */
#define CMD_MI (0x0 << 29)
#define MI_LOAD_REGISTER_IMM (CMD_MI | (0x22 << 23))
#define MI_LOAD_REGISTER_REG (CMD_MI | (0x2A << 23))
#define HSW_MI_MATH (CMD_MI | (0x1a << 23))

/* MI_MATH ALU opcodes */
#define MI_MATH_OPCODE_NOOP 0x000
#define MI_MATH_OPCODE_LOAD 0x080
#define MI_MATH_OPCODE_LOADINV 0x480
#define MI_MATH_OPCODE_LOAD0 0x081
#define MI_MATH_OPCODE_LOAD1 0x481
#define MI_MATH_OPCODE_ADD 0x100
#define MI_MATH_OPCODE_SUB 0x101
#define MI_MATH_OPCODE_AND 0x102
#define MI_MATH_OPCODE_OR 0x103
#define MI_MATH_OPCODE_XOR 0x104
#define MI_MATH_OPCODE_STORE 0x180
#define MI_MATH_OPCODE_STOREINV 0x580

/* MI_MATH ALU operands */
#define MI_MATH_OPERAND_R0 0x00
#define MI_MATH_OPERAND_R1 0x01
#define MI_MATH_OPERAND_R2 0x02
#define MI_MATH_OPERAND_R3 0x03
#define MI_MATH_OPERAND_R4 0x04
#define MI_MATH_OPERAND_SRCA 0x20
#define MI_MATH_OPERAND_SRCB 0x21
#define MI_MATH_OPERAND_ACCU 0x31
#define MI_MATH_OPERAND_ZF 0x32
#define MI_MATH_OPERAND_CF 0x33

/* ALU instruction encoders: the opcode goes at bit 20, the first operand at
 * bit 10 and the second operand in the low bits.  @opcode is the suffix of a
 * MI_MATH_OPCODE_* name.  Operands are parenthesized so that arbitrary
 * expressions (e.g. "a | b") are encoded correctly.
 */
#define MI_MATH_ALU2(opcode, operand1, operand2) \
   ( ((MI_MATH_OPCODE_##opcode) << 20) | ((operand1) << 10) | (operand2) )
#define MI_MATH_ALU1(opcode, operand1) \
   ( ((MI_MATH_OPCODE_##opcode) << 20) | ((operand1) << 10) )
#define MI_MATH_ALU0(opcode) \
   ( ((MI_MATH_OPCODE_##opcode) << 20) )

/* MMIO offset of the lower dword of general purpose register n; the upper
 * dword lives at +4.
 */
#define HSW_CS_GPR(n) (0x2600 + (n) * 8)

/* Just to improve readability a bit */
#define R0 0
/* Fake batch buffer: OUT_BATCH() appends dwords here and main() later feeds
 * them, one at a time, to rcs_parse().
 */
static uint32_t batch[1024];
static int batch_ptr; /* index of the next free dword in batch[] */

/* Cut-down stand-in for the driver context: only the hardware generation is
 * kept, which the emitters consult to pick the maximum number of ALU
 * instructions per MI_MATH command.
 */
struct brw_context {
   int gen;
};

static struct brw_context _brw = { .gen = 9 };
/*
 * Emulated ALU state.
 *
 *  - sixteen u64 general purpose registers (R0..R15)
 *  - two u64 source registers (SRCA, SRCB) that feed every operation
 *  - one u64 accumulator (ACCU) that receives every result
 *  - a zero flag and a carry flag (ZF, CF)
 *
 * Supported operations: load, store, addition, subtraction, AND, OR, XOR.
 */
static uint64_t alu_gprs[16];
static uint64_t alu_srca;
static uint64_t alu_srcb;
static uint64_t alu_accu;
static uint64_t alu_zf;
static uint64_t alu_cf;
- static void
- alu_load(int operand0_dst, int operand1_gpr)
- {
- if (operand1_gpr < 0 || operand1_gpr >= 16) {
- fprintf(stderr, "invalid ALU GPR\n");
- return;
- }
- if (operand0_dst == MI_MATH_OPERAND_SRCA)
- alu_srca = alu_gprs[operand1_gpr];
- else if (operand0_dst == MI_MATH_OPERAND_SRCB)
- alu_srcb = alu_gprs[operand1_gpr];
- else
- fprintf(stderr, "invalid load operand0 destination (should be SRCA or SRCB)\n");
- }
- static void
- alu_loadinv(int operand0_dst, int operand1_gpr)
- {
- if (operand1_gpr < 0 || operand1_gpr >= 16) {
- fprintf(stderr, "invalid ALU GPR\n");
- return;
- }
- if (operand0_dst == MI_MATH_OPERAND_SRCA)
- alu_srca = ~alu_gprs[operand1_gpr];
- else if (operand0_dst == MI_MATH_OPERAND_SRCB)
- alu_srcb = ~alu_gprs[operand1_gpr];
- else
- fprintf(stderr, "invalid load operand0 destination (should be SRCA or SRCB)\n");
- }
- static void
- alu_load0(int operand0_dst)
- {
- if (operand0_dst == MI_MATH_OPERAND_SRCA)
- alu_srca = 0;
- else if (operand0_dst == MI_MATH_OPERAND_SRCB)
- alu_srcb = 0;
- else
- fprintf(stderr, "invalid load0 operand0 destination (should be SRCA or SRCB)\n");
- }
- static void
- alu_load1(int operand0_dst)
- {
- if (operand0_dst == MI_MATH_OPERAND_SRCA)
- alu_srca = 1;
- else if (operand0_dst == MI_MATH_OPERAND_SRCB)
- alu_srcb = 1;
- else
- fprintf(stderr, "invalid load1 operand0 destination (should be SRCA or SRCB)\n");
- }
- static void
- alu_store(int operand0_gpr_dst,
- int operand1_src)
- {
- if (operand0_gpr_dst < 0 || operand0_gpr_dst >= 16) {
- fprintf(stderr, "invalid ALU GPR\n");
- return;
- }
- switch (operand1_src) {
- case MI_MATH_OPERAND_ACCU:
- alu_gprs[operand0_gpr_dst] = alu_accu;
- break;
- case MI_MATH_OPERAND_ZF:
- alu_gprs[operand0_gpr_dst] = alu_zf ? ~0ULL : 0ULL;
- break;
- case MI_MATH_OPERAND_CF:
- alu_gprs[operand0_gpr_dst] = alu_cf ? ~0ULL : 0ULL;
- break;
- default:
- fprintf(stderr, "invalid store operand (should be ACCU, ZF or CF)\n");
- }
- }
- /* Implement as cascaded 32bit adds so we can easily handle the add with carry */
- static void
- alu_add(void)
- {
- uint64_t lower = (alu_srca & 0xffffffff) + (alu_srcb & 0xffffffff);
- uint64_t carry = lower > UINT32_MAX ? 1: 0;
- uint64_t upper = (alu_srca >> 32) +
- (alu_srcb >> 32) +
- carry;
- alu_cf = upper > UINT32_MAX ? 1: 0;
- alu_accu = (upper << 32) | lower;
- alu_zf = !alu_accu;
- }
- /* FIXME: handle carry */
- static void
- alu_sub(void)
- {
- alu_cf = 0;
- alu_accu = alu_srca - alu_srcb;
- alu_zf = !alu_accu;
- }
- static void
- alu_and(void)
- {
- alu_accu = alu_srca & alu_srcb;
- alu_zf = !alu_accu;
- }
- static void
- alu_or(void)
- {
- alu_accu = alu_srca | alu_srcb;
- alu_zf = !alu_accu;
- }
- static void
- alu_xor(void)
- {
- alu_accu = alu_srca ^ alu_srcb;
- alu_zf = !alu_accu;
- }
- static const char *
- operand_name(int operand)
- {
- static const char *gpr_names[] = {
- "R0",
- "R1",
- "R2",
- "R3",
- "R4",
- "R5",
- "R6",
- "R7",
- "R8",
- "R9",
- "R10",
- "R11",
- "R12",
- "R13",
- "R14",
- "R15",
- };
- if (operand < 16) {
- return gpr_names[operand];
- }
- switch (operand) {
- case MI_MATH_OPERAND_SRCA:
- return "SRCA";
- case MI_MATH_OPERAND_SRCB:
- return "SRCB";
- case MI_MATH_OPERAND_ACCU:
- return "ACCU";
- case MI_MATH_OPERAND_ZF:
- return "ZF";
- case MI_MATH_OPERAND_CF:
- return "CF";
- default:
- fprintf(stderr, "Unknown operand\n");
- assert(0);
- return NULL;
- }
- }
- static uint64_t
- operand_val(int operand)
- {
- if (operand < 16) {
- return alu_gprs[operand];
- }
- switch (operand) {
- case MI_MATH_OPERAND_SRCA:
- return alu_srca;
- case MI_MATH_OPERAND_SRCB:
- return alu_srcb;
- case MI_MATH_OPERAND_ACCU:
- return alu_accu;
- case MI_MATH_OPERAND_ZF:
- return alu_zf;
- case MI_MATH_OPERAND_CF:
- return alu_cf;
- default:
- fprintf(stderr, "Unknown operand\n");
- assert(0);
- return 0;
- }
- }
- static void
- write_reg(uint32_t reg, uint32_t val)
- {
- assert((reg & 0x3) == 0);
- if (reg >= 0x2600 && reg < 0x2600 + 8 * 16) {
- int gpr = reg - 0x2600;
- gpr /= 4;
- ((uint32_t *)alu_gprs)[gpr] = val;
- printf("%50s: %0x=%u\n", "", reg, val);
- } else
- fprintf(stderr, "Unknown register");
- }
/* MI_LOAD_REGISTER_IMM payload parser state: number of payload dwords seen
 * so far, and the register offset taken from the most recent even dword.
 */
static int mi_lri_parser_offset;
static int mi_lri_parser_reg;

/* Consume one payload dword of an MI_LOAD_REGISTER_IMM command.  The payload
 * alternates register offset / immediate value: even dwords are remembered
 * as the target register, odd dwords are written to it.
 */
static void
mi_lri_parse(uint32_t word)
{
   const int is_value = mi_lri_parser_offset & 1;

   mi_lri_parser_offset++;

   if (!is_value) {
      mi_lri_parser_reg = word;
      return;
   }

   write_reg(mi_lri_parser_reg, word);
}
- static uint32_t
- load_reg(uint32_t reg)
- {
- assert((reg & 0x3) == 0);
- if (reg >= 0x2600 && reg < 0x2600 + 8 * 16) {
- int gpr = reg - 0x2600;
- gpr /= 4;
- return ((uint32_t *)alu_gprs)[gpr];
- } else {
- fprintf(stderr, "Unknown register");
- return 0;
- }
- }
/* MI_LOAD_REGISTER_REG payload parser state: number of payload dwords seen
 * so far, and the source register offset taken from the most recent even
 * dword.
 */
static int mi_lrr_parser_offset;
static int mi_lrr_parser_src_reg;

/* Consume one payload dword of an MI_LOAD_REGISTER_REG command.  Even dwords
 * are remembered as the source register; odd dwords name the destination,
 * which is then written with the value read from the source.
 */
static void
mi_lrr_parse(uint32_t word)
{
   const int is_dest = mi_lrr_parser_offset & 1;

   mi_lrr_parser_offset++;

   if (!is_dest) {
      mi_lrr_parser_src_reg = word;
      return;
   }

   write_reg(word, load_reg(mi_lrr_parser_src_reg));
}
- static void
- mi_math_parse(uint32_t word)
- {
- uint32_t opcode;
- uint32_t operand0, operand1;
- /* ALU instruction = 20bit opcode : 10bit operand0 : 10bit operand1 */
- opcode = (word & (0xfff << 20)) >> 20;
- operand0 = (word & (0x3ff << 10)) >> 10;
- operand1 = word & 0x3ff;
- switch (opcode) {
- case MI_MATH_OPCODE_LOAD:
- printf(" LOAD %s=%"PRIu64" %s=%"PRIu64"\n",
- operand_name(operand0),
- operand_val(operand0),
- operand_name(operand1),
- operand_val(operand1));
- alu_load(operand0, operand1);
- printf("%50s: %s=%"PRIu64"\n", "",
- operand_name(operand0),
- operand_val(operand0));
- break;
- case MI_MATH_OPCODE_LOADINV:
- printf(" LOADINV %s=%"PRIu64" %s=%"PRIu64"\n",
- operand_name(operand0),
- operand_val(operand0),
- operand_name(operand1),
- operand_val(operand1));
- alu_loadinv(operand0, operand1);
- printf("%50s: %s=%"PRIu64"\n", "",
- operand_name(operand0),
- operand_val(operand0));
- break;
- case MI_MATH_OPCODE_LOAD0:
- printf(" LOAD0 %s=%"PRIu64"\n",
- operand_name(operand0),
- operand_val(operand0));
- alu_load0(operand0);
- printf("%50s: %s=%"PRIu64"\n", "",
- operand_name(operand0),
- operand_val(operand0));
- break;
- case MI_MATH_OPCODE_LOAD1:
- printf(" LOAD1 %s=%"PRIu64"\n",
- operand_name(operand0),
- operand_val(operand0));
- alu_load1(operand0);
- printf("%50s: %s=%"PRIu64"\n", "",
- operand_name(operand0),
- operand_val(operand0));
- break;
- case MI_MATH_OPCODE_STORE:
- printf(" STORE %s=%"PRIu64" %s=%"PRIu64"\n",
- operand_name(operand0),
- operand_val(operand0),
- operand_name(operand1),
- operand_val(operand1));
- alu_store(operand0, operand1);
- printf("%50s: %s=%"PRIu64"\n", "",
- operand_name(operand0),
- operand_val(operand0));
- break;
- case MI_MATH_OPCODE_ADD:
- printf(" ADD (A=%"PRIu64", B=%"PRIu64")\n",
- operand_val(MI_MATH_OPERAND_SRCA),
- operand_val(MI_MATH_OPERAND_SRCB));
- alu_add();
- break;
- case MI_MATH_OPCODE_SUB:
- printf(" SUB (A=%"PRIu64", B=%"PRIu64")\n",
- operand_val(MI_MATH_OPERAND_SRCA),
- operand_val(MI_MATH_OPERAND_SRCB));
- alu_sub();
- break;
- case MI_MATH_OPCODE_AND:
- printf(" AND (A=%"PRIu64", B=%"PRIx64")\n",
- operand_val(MI_MATH_OPERAND_SRCA),
- operand_val(MI_MATH_OPERAND_SRCB));
- alu_and();
- break;
- case MI_MATH_OPCODE_OR:
- printf(" OR (A=%"PRIu64", B=%"PRIu64")\n",
- operand_val(MI_MATH_OPERAND_SRCA),
- operand_val(MI_MATH_OPERAND_SRCB));
- alu_or();
- break;
- case MI_MATH_OPCODE_XOR:
- printf(" XOR (A=%"PRIu64", B=%"PRIx64")\n",
- operand_val(MI_MATH_OPERAND_SRCA),
- operand_val(MI_MATH_OPERAND_SRCB));
- alu_xor();
- break;
- }
- printf("%50s: ACCU=%"PRIu64", CF=%d, ZF=%d\n", "", alu_accu, !!alu_cf, !!alu_zf);
- }
- static void (*sub_cmd_parser)(uint32_t word);
- static void
- rcs_parse(uint32_t word)
- {
- int engine = word & (0x3 << 29);
- /* number of dwords left before we should pop to prev parser */
- static int cmd_remainder = 0;
- if (!sub_cmd_parser) {
- if (engine == CMD_MI) {
- #define MI_OPCODE(X) ((X & (0x3f << 23)) >> 23)
- switch (MI_OPCODE(word)) {
- case MI_OPCODE(MI_LOAD_REGISTER_IMM):
- mi_lri_parser_offset = 0;
- cmd_remainder = (word & 0x7f) + 2;
- printf("MI_LRI(len = %d)\n", cmd_remainder);
- sub_cmd_parser = mi_lri_parse;
- break;
- case MI_OPCODE(MI_LOAD_REGISTER_REG):
- mi_lrr_parser_offset = 0;
- cmd_remainder = (word & 0x7f) + 2;
- printf("MI_LRR(len = %d)\n", cmd_remainder);
- sub_cmd_parser = mi_lrr_parse;
- break;
- case MI_OPCODE(HSW_MI_MATH):
- cmd_remainder = (word & 0x7f) + 2;
- printf("MI_MATH(len = %d)\n", cmd_remainder);
- sub_cmd_parser = mi_math_parse;
- break;
- default:
- fprintf(stderr, "Unhandled MI command\n");
- }
- #undef MI_OPCODE
- } else {
- fprintf(stderr, "Only handling MI commands\n");
- }
- cmd_remainder--;
- } else {
- sub_cmd_parser(word);
- if (--cmd_remainder == 0)
- sub_cmd_parser = NULL;
- }
- }
- static void
- brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
- {
- BEGIN_BATCH(5);
- OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
- OUT_BATCH(reg);
- OUT_BATCH(imm & 0xffffffff);
- OUT_BATCH(reg + 4);
- OUT_BATCH(imm >> 32);
- ADVANCE_BATCH();
- }
- static void
- brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
- {
- BEGIN_BATCH(3);
- OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
- OUT_BATCH(reg);
- OUT_BATCH(imm);
- ADVANCE_BATCH();
- }
- static void
- brw_load_register_reg(struct brw_context *brw, uint32_t dest, uint32_t src)
- {
- BEGIN_BATCH(3);
- OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
- OUT_BATCH(src);
- OUT_BATCH(dest);
- ADVANCE_BATCH();
- }
- static void
- alu_logic_op_gpr_u64_with_tmp(struct brw_context *brw,
- uint32_t op, /* AND, OR, XOR */
- int operand0_gpr, /* 0-15 */
- uint64_t operand1_imm,
- int tmp_gpr) /* 0-15 */
- {
- uint32_t maths[] = {
- MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, operand0_gpr),
- MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, tmp_gpr),
- op << 20,
- MI_MATH_ALU0(AND),
- MI_MATH_ALU2(STORE, operand0_gpr, MI_MATH_OPERAND_ACCU),
- };
- assert(operand0_gpr != tmp_gpr);
- brw_load_register_imm64(brw, HSW_CS_GPR(tmp_gpr), operand1_imm);
- BEGIN_BATCH(1 + ARRAY_SIZE(maths));
- OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));
- for (int m = 0; m < ARRAY_SIZE(maths); m++)
- OUT_BATCH(maths[m]);
- ADVANCE_BATCH();
- }
- /* left shift any GPR using one other GPR as a temporary */
- static void
- alu_lshift_gpr_with_tmp(struct brw_context *brw,
- int gpr,
- int shift,
- int gpr_tmp)
- {
- uint32_t left_shift[] = {
- MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, gpr),
- MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, gpr),
- MI_MATH_ALU0(ADD),
- MI_MATH_ALU2(STORE, gpr, MI_MATH_OPERAND_ACCU),
- };
- uint64_t top_mask = (1ULL << (64 - shift)) - 1;
- int max_math_ops = brw->gen >= 9 ? 128 : 32;
- /* XXX: assuming array size is a factor of max ops... */
- int max_shifts = max_math_ops / ARRAY_SIZE(left_shift);
- int n_cmds = (shift + max_shifts - 1) / max_shifts;
- int batch_len = n_cmds * 4 + shift * ARRAY_SIZE(left_shift);
- assert(shift > 0);
- assert(shift < 64);
- /* Copying hsw_queryobj.c idea here and masking out the top
- * bits to avoid overflow.
- *
- * XXX: any reason to really worry about setting CF?
- * TODO: double check details of how ALU handles overflow
- */
- alu_logic_op_gpr_u64_with_tmp(brw,
- MI_MATH_OPCODE_AND,
- gpr,
- top_mask,
- gpr_tmp);
- BEGIN_BATCH(batch_len);
- while (shift) {
- int n_shifts = min(max_shifts, shift);
- int cmd_len = ARRAY_SIZE(left_shift) * n_shifts + 1;
- OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
- for (int i = 0; i < n_shifts; i++) {
- for (int m = 0; m < ARRAY_SIZE(left_shift); m++)
- OUT_BATCH(left_shift[m]);
- }
- shift -= n_shifts;
- }
- ADVANCE_BATCH();
- }
/* Right shift a GPR by fewer than 32 bits, using two other GPRs as
 * temporaries.
 *
 * Since the upper and lower 32bit words of the ALU's general purpose
 * registers are addressable via mmio, a right shift by n can be built from
 * left shifts by (32-n): after such a shift the wanted bits sit in the upper
 * 32bits of the register, from where they are copied into place.
 *
 * This is limited to shifts < 32 bits.
 *
 * In the worst case a rshift by one bit will result in 248 math ops plus
 * headers, three LRRs and an LRI. That sounds like a lot, but still
 * I *guess* the ALU operations being so simple only take one or two
 * clocks each.
 *
 * Totally guessing at the ALU taking maybe 2 cycles per ADD with the
 * GPU running at 500MHz that would be less than a microsecond, so
 * that could still be fine if the work is separate from the
 * application capturing metrics, only done when storing the results.
 *
 * XXX: There are probably common cases where it can be assumed that
 * upper 32bits of the result should be zero and the number of
 * instructions could be more than halved.
 */
static void
alu_rshift_gpr_32_with_2_tmp(struct brw_context *brw,
                             int gpr,
                             int shift,
                             int gpr_tmp0,
                             int gpr_tmp1)
{
   const uint32_t gpr_lo = HSW_CS_GPR(gpr);
   const uint32_t gpr_hi = gpr_lo + 4;
   const uint32_t tmp1_lo = HSW_CS_GPR(gpr_tmp1);
   const uint32_t tmp1_hi = tmp1_lo + 4;
   int complement;

   assert(shift > 0);
   assert(shift < 32);

   complement = 32 - shift;

   /* tmp1 = upper 32bits of the source, zero extended; shifted on its own
    * further down
    */
   brw_load_register_reg(brw, tmp1_lo, gpr_hi);
   brw_load_register_imm32(brw, tmp1_hi, 0);

   /* Lower half of the result: shift left, then move the upper dword down */
   alu_lshift_gpr_with_tmp(brw, gpr, complement, gpr_tmp0);
   brw_load_register_reg(brw, gpr_lo, gpr_hi);

   /* Upper half of the result: same trick on the saved upper dword */
   alu_lshift_gpr_with_tmp(brw, gpr_tmp1, complement, gpr_tmp0);
   brw_load_register_reg(brw, gpr_hi, tmp1_hi);
}
/* Nabbed from src/util/bitscan.h
 *
 * Returns the one-based position of the most significant set bit of @u
 * (1 for the least significant bit, 64 for the most significant), or 0 when
 * @u is zero.
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   if (u == 0)
      return 0;
   return 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM || _M_IA64)
   unsigned long index;
   return _BitScanReverse64(&index, u) ? index + 1 : 0;
#else
   unsigned n_bits;

   /* Portable fallback: count how many right shifts empty the value */
   for (n_bits = 0; u != 0; u >>= 1)
      n_bits++;

   return n_bits;
#endif
}
- /* Approximate an arbitrary multiplication by separating into a
- * converging sum of power-of-two multiplications.
- *
- * Note: this potentially uses all 16 of the ALU registers, depending
- * on the number of bits set in @factor.
- *
- * Note: We don't limit the factor to 16bits since we want to support
- * 48:16 fixed point factors when we need to multiply by a fraction,
- * but in general we only have 16bit of precision for the factor.
- */
- static void
- alu_mul_gpr0_u64(struct brw_context *brw, uint64_t factor)
- {
- int max_math_ops = brw->gen >= 9 ? 128 : 32;
- /* A single lshift is 4 ops: 2 LOADs into A/B, ADD, STORE back */
- int ops_per_lshift = 4;
- int max_shifts = max_math_ops / ops_per_lshift;
- /* We have up 16 GPRs we can use to save intermediate POT multiplications */
- #define MAX_STEPS 16
- int pot_shifts[MAX_STEPS + 1]; /* room to zero terminate */
- int n_steps;
- if (factor == 0) {
- BEGIN_BATCH(3 * 4);
- OUT_BATCH(HSW_MI_MATH | (3 - 2));
- OUT_BATCH(MI_MATH_ALU1(LOAD0, MI_MATH_OPERAND_SRCA));
- OUT_BATCH(MI_MATH_ALU2(STORE, R0, MI_MATH_OPERAND_SRCA));
- ADVANCE_BATCH();
- return;
- }
- /* Determine how many separate power of two multiplications to
- * decompose this in to.
- *
- * We scan from most significant bits to the least. If the factor
- * has > MAX_STEPS bits set then the least significant bits will be
- * ignored, making the result less precise.
- */
- for (n_steps = 0; n_steps < MAX_STEPS && factor; n_steps++) {
- pot_shifts[n_steps] = util_last_bit64(factor) - 1;
- factor -= (1 << pot_shifts[n_steps]);
- }
- /* zero terminate for calculating deltas below */
- pot_shifts[n_steps] = 0;
- /* Example states:
- * factor = 81, pot_shifts[0]=6, pot_shifts[1]=4, pot_shifts[2]=0, n_steps = 3
- * factor = 80, pot_shifts[0]=6, pot_shifts[1]=4, n_steps = 2
- * factor = 3, pot_shifts[0]=1, pot_shifts[1]=0, n_steps = 2
- * factor = 2, pot_shifts[0]=1, n_steps = 1
- * factor = 1, pot_shifts[0]=0, n_steps = 1
- */
- /* Starting with the least significant POT factor we shift the
- * original value starting in R0 towards the most significant POT
- * factor.
- *
- * For each step we progress which general purpose register we
- * save the result into, with the first step always loading and
- * saving into R0, then the next step loads from R0 saving to
- * R1, for a total of up to 16 intermediate POT multiplications.
- *
- * Note: the first step might be a factor of one (shift 0), which is
- * a NOOP
- */
- for (int i = pot_shifts[n_steps - 1] ? 0 : 1; i < n_steps; i++) {
- int step_load_gpr = i ? i - 1: i; /* load + store to R0 for first step */
- int step_store_gpr = i;
- /* How far to shift before reaching the next step? */
- int shift = pot_shifts[n_steps - i - 1] - pot_shifts[n_steps - i];
- /* Note: careful to round up here... */
- int n_cmds = (shift + max_shifts - 1) / max_shifts;
- int batch_len = n_cmds * 4 + shift * ops_per_lshift;
- bool first_step_shift = true;
- BEGIN_BATCH(batch_len);
- while (shift) {
- int n_packed_shifts = min(max_shifts, shift);
- int cmd_len = ops_per_lshift * n_packed_shifts + 1;
- OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
- for (int i = 0; i < n_packed_shifts; i++) {
- int load_gpr = first_step_shift ? step_load_gpr : step_store_gpr;
- OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, load_gpr));
- OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, load_gpr));
- OUT_BATCH(MI_MATH_ALU0(ADD));
- OUT_BATCH(MI_MATH_ALU2(STORE, step_store_gpr,
- MI_MATH_OPERAND_ACCU));
- first_step_shift = false;
- }
- shift -= n_packed_shifts;
- }
- ADVANCE_BATCH();
- }
- /* If the multiplication was split up then sum intermediate values... */
- if (n_steps > 1) {
- int ops_per_add = 4; /* 2 LOADS in to A/B, ADD, STORE back */
- int max_adds = max_math_ops / ops_per_add;
- int adds = n_steps - 1;
- /* Note: careful to round up here... */
- int n_cmds = (adds + max_adds - 1) / max_adds;
- int batch_len = n_cmds * 4 + adds * ops_per_add;
- int saved = R0 + 1;
- BEGIN_BATCH(batch_len);
- while (adds) {
- int n_packed_adds = min(max_adds, adds);
- int cmd_len = ops_per_add * n_packed_adds + 1;
- OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
- for (int i = 0; i < n_packed_adds; i++) {
- OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, R0));
- OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, saved));
- OUT_BATCH(MI_MATH_ALU0(ADD));
- OUT_BATCH(MI_MATH_ALU2(STORE, R0, MI_MATH_OPERAND_ACCU));
- saved++;
- }
- adds -= n_packed_adds;
- }
- ADVANCE_BATCH();
- }
- }
- static void
- alu_mul_gpr0_float(struct brw_context *brw, float factor)
- {
- assert(factor >= 0);
- if (floorf(factor) != factor) {
- /* If we need to multiply by a floating point factor then
- * scaling by a further 2^16 will effectively result in a
- * 48:16 fixed point value...
- */
- alu_mul_gpr0_u64(brw, factor * 65536);
- /* So now we just need to drop the fixed point fraction */
- alu_rshift_gpr_32_with_2_tmp(brw, R0, 16, R0 + 1, R0 + 2);
- } else
- alu_mul_gpr0_u64(brw, factor);
- }
- int
- main(int argc, char **argv)
- {
- alu_gprs[0] = 123; /* load 'timestamp' */
- //alu_mul_gpr0_float(&_brw, 64);
- //alu_mul_gpr0_float(&_brw, 80);
- //alu_mul_gpr0_float(&_brw, 83);
- alu_mul_gpr0_float(&_brw, 83.33);
- printf("Batch length = %d words\n", batch_ptr);
- printf("Initial GPR0 = %"PRIu64"\n", alu_gprs[0]);
- for (int i = 0; i < batch_ptr; i++)
- rcs_parse(batch[i]);
- printf("Final GPR0 = %"PRIu64"\n", alu_gprs[0]);
- return 0;
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement