/* General Multiplication with Gen RCS ALU */

/*
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 *
 * A stand alone test bed for abusing the Gen graphics command
 * streamer ALU
 */

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <inttypes.h>

/* Classic min/max helpers. Each argument may be evaluated twice, so do not
 * pass expressions with side effects.
 */
#define max(A, B) ( (A)>(B) ? (A) : (B) )
#define min(A, B) ( (A)<(B) ? (A) : (B) )

/* Number of elements in a real array (not a pointer). The argument is
 * parenthesized so that arbitrary array-valued expressions index correctly.
 */
#define ARRAY_SIZE(X) (sizeof(X)/sizeof((X)[0]))

/* Minimal batch emission macros for this test bed:
 *  - BEGIN_BATCH only evaluates its length argument,
 *  - ADVANCE_BATCH expands to nothing,
 *  - OUT_BATCH appends one dword to the global batch[] array, asserting that
 *    the fixed-size array is not overrun.
 */
#define BEGIN_BATCH(X) do { (void)(X); } while(0)
#define ADVANCE_BATCH(X)
#define OUT_BATCH(X) do { \
      assert(batch_ptr >= 0 && (size_t)batch_ptr < ARRAY_SIZE(batch)); \
      batch[batch_ptr++] = (X); \
   } while(0)

/* Command type value for MI commands, positioned at bit 29 of the header */
#define CMD_MI              (0x0 << 29)

/* MI command headers: the MI opcode sits at bit 23; emitters OR the dword
 * length (total dwords - 2) into the low bits.
 */
#define MI_LOAD_REGISTER_IMM        (CMD_MI | (0x22 << 23))
#define MI_LOAD_REGISTER_REG        (CMD_MI | (0x2A << 23))

#define HSW_MI_MATH         (CMD_MI | (0x1a << 23))

/* ALU opcodes; these occupy the top 12 bits (31:20) of an ALU instruction */
#define MI_MATH_OPCODE_NOOP      0x000
#define MI_MATH_OPCODE_LOAD      0x080
#define MI_MATH_OPCODE_LOADINV   0x480
#define MI_MATH_OPCODE_LOAD0     0x081
#define MI_MATH_OPCODE_LOAD1     0x481
#define MI_MATH_OPCODE_ADD       0x100
#define MI_MATH_OPCODE_SUB       0x101
#define MI_MATH_OPCODE_AND       0x102
#define MI_MATH_OPCODE_OR        0x103
#define MI_MATH_OPCODE_XOR       0x104
#define MI_MATH_OPCODE_STORE     0x180
#define MI_MATH_OPCODE_STOREINV  0x580

/* ALU operand encodings (10 bit fields): general purpose registers are
 * numbered from zero, followed by the special source/result/flag operands.
 */
#define MI_MATH_OPERAND_R0   0x00
#define MI_MATH_OPERAND_R1   0x01
#define MI_MATH_OPERAND_R2   0x02
#define MI_MATH_OPERAND_R3   0x03
#define MI_MATH_OPERAND_R4   0x04
#define MI_MATH_OPERAND_SRCA 0x20
#define MI_MATH_OPERAND_SRCB 0x21
#define MI_MATH_OPERAND_ACCU 0x31
#define MI_MATH_OPERAND_ZF   0x32
#define MI_MATH_OPERAND_CF   0x33

/* Build one ALU instruction dword from an opcode suffix (e.g. LOAD) and up to
 * two operands. Operands are parenthesized so compound expressions such as
 * `a | b` are encoded as a whole rather than being split by precedence.
 */
#define MI_MATH_ALU2(opcode, operand1, operand2) \
   ( ((MI_MATH_OPCODE_##opcode) << 20) | ((operand1) << 10) | (operand2) )

#define MI_MATH_ALU1(opcode, operand1) \
   ( ((MI_MATH_OPCODE_##opcode) << 20) | ((operand1) << 10) )

#define MI_MATH_ALU0(opcode) \
   ( ((MI_MATH_OPCODE_##opcode) << 20) )

/* Register offset of general purpose register n: GPRs start at 0x2600 and are
 * 8 bytes apart; offset + 4 addresses the upper 32 bits of the same GPR.
 */
#define HSW_CS_GPR(n) (0x2600 + (n) * 8)

/* Just to improve readability a bit */
#define R0 0

/* The emitted command stream: OUT_BATCH() appends dwords at batch_ptr and
 * main() later feeds them one by one to the parser.
 */
static uint32_t batch[1024];
static int batch_ptr;

/* Stand-in for the driver context; only the hardware generation is needed,
 * which selects how many ALU ops may be packed into one MI_MATH command.
 */
static struct brw_context {
    int gen;
} _brw = {
    .gen = 9
};

/*
 * ALU Summary:
 *
 * Two u64 source registers: SRCA/B
 * Sixteen general purpose u64 registers: R0..R15
 * u64 accumulator: ACCU
 * zero and carry flags: ZF, CF
 * OPS: load, store, addition, subtraction, AND, OR, XOR
 */
/* Simulated ALU state, updated by the alu_*() helpers below */
static uint64_t alu_accu;     /* result of the last arithmetic/logic op */
static uint64_t alu_srca;     /* first source operand */
static uint64_t alu_srcb;     /* second source operand */
static uint64_t alu_zf;       /* non-zero when the last result was zero */
static uint64_t alu_cf;       /* non-zero when the last op carried */
static uint64_t alu_gprs[16]; /* general purpose registers R0..R15 */


/* Simulate the ALU LOAD op: copy general purpose register @operand1_gpr into
 * the source register named by @operand0_dst (SRCA or SRCB).
 *
 * An out-of-range GPR index or an unsupported destination is reported on
 * stderr and leaves the ALU state untouched.
 */
static void
alu_load(int operand0_dst, int operand1_gpr)
{
  if (operand1_gpr < 0 || operand1_gpr >= 16) {
      fprintf(stderr, "invalid ALU GPR\n");
      return;
  }

  switch (operand0_dst) {
    case MI_MATH_OPERAND_SRCA:
      alu_srca = alu_gprs[operand1_gpr];
      break;
    case MI_MATH_OPERAND_SRCB:
      alu_srcb = alu_gprs[operand1_gpr];
      break;
    default:
      fprintf(stderr, "invalid load operand0 destination (should be SRCA or SRCB)\n");
      break;
  }
}

/* Simulate the ALU LOADINV op: copy the bitwise inverse of general purpose
 * register @operand1_gpr into the source register named by @operand0_dst
 * (SRCA or SRCB).
 *
 * An out-of-range GPR index or an unsupported destination is reported on
 * stderr and leaves the ALU state untouched.
 */
static void
alu_loadinv(int operand0_dst, int operand1_gpr)
{
  if (operand1_gpr < 0 || operand1_gpr >= 16) {
      fprintf(stderr, "invalid ALU GPR\n");
      return;
  }

  switch (operand0_dst) {
    case MI_MATH_OPERAND_SRCA:
      alu_srca = ~alu_gprs[operand1_gpr];
      break;
    case MI_MATH_OPERAND_SRCB:
      alu_srcb = ~alu_gprs[operand1_gpr];
      break;
    default:
      fprintf(stderr, "invalid load operand0 destination (should be SRCA or SRCB)\n");
      break;
  }
}

/* Simulate the ALU LOAD0 op: clear the source register named by
 * @operand0_dst (SRCA or SRCB). Any other destination is reported on stderr.
 */
static void
alu_load0(int operand0_dst)
{
  switch (operand0_dst) {
    case MI_MATH_OPERAND_SRCA:
      alu_srca = 0;
      break;
    case MI_MATH_OPERAND_SRCB:
      alu_srcb = 0;
      break;
    default:
      fprintf(stderr, "invalid load0 operand0 destination (should be SRCA or SRCB)\n");
      break;
  }
}

/* Simulate the ALU LOAD1 op: set the source register named by @operand0_dst
 * (SRCA or SRCB) to one. Any other destination is reported on stderr.
 */
static void
alu_load1(int operand0_dst)
{
  switch (operand0_dst) {
    case MI_MATH_OPERAND_SRCA:
      alu_srca = 1;
      break;
    case MI_MATH_OPERAND_SRCB:
      alu_srcb = 1;
      break;
    default:
      fprintf(stderr, "invalid load1 operand0 destination (should be SRCA or SRCB)\n");
      break;
  }
}

/* Simulate the ALU STORE op: write the accumulator or one of the flags into
 * general purpose register @operand0_gpr_dst.
 *
 * @operand1_src must be ACCU, ZF or CF. A flag is stored as all-ones when set
 * and as zero when clear. Invalid arguments are reported on stderr and leave
 * the destination register unchanged.
 */
static void
alu_store(int operand0_gpr_dst,
          int operand1_src)
{
  uint64_t value;

  if (operand0_gpr_dst < 0 || operand0_gpr_dst >= 16) {
      fprintf(stderr, "invalid ALU GPR\n");
      return;
  }

  if (operand1_src == MI_MATH_OPERAND_ACCU) {
      value = alu_accu;
  } else if (operand1_src == MI_MATH_OPERAND_ZF) {
      value = alu_zf ? ~0ULL : 0ULL;
  } else if (operand1_src == MI_MATH_OPERAND_CF) {
      value = alu_cf ? ~0ULL : 0ULL;
  } else {
      fprintf(stderr, "invalid store operand (should be ACCU, ZF or CF)\n");
      return;
  }

  alu_gprs[operand0_gpr_dst] = value;
}

/* Simulate the ALU ADD op: ACCU = SRCA + SRCB, updating CF and ZF.
 *
 * Implemented as cascaded 32bit adds so we can easily handle the add with
 * carry. The carry out of the lower add is folded into the upper add, so it
 * must be masked off the lower word before the halves are recombined;
 * otherwise bit 32 of the result would be forced to one whenever the lower
 * add overflowed.
 */
static void
alu_add(void)
{
  uint64_t lower = (alu_srca & 0xffffffff) + (alu_srcb & 0xffffffff);
  uint64_t carry = lower > UINT32_MAX ? 1: 0;
  uint64_t upper = (alu_srca >> 32) +
                   (alu_srcb >> 32) +
                   carry;

  alu_cf = upper > UINT32_MAX ? 1: 0;
  alu_accu = (upper << 32) | (lower & 0xffffffff);
  alu_zf = !alu_accu;
}

/* Simulate the ALU SUB op: ACCU = SRCA - SRCB (modulo 2^64), updating CF and
 * ZF.
 *
 * CF reports the borrow, i.e. it is set when SRCA < SRCB.
 * NOTE(review): this matches how unsigned less-than is commonly derived from
 * SUB + STORE CF; confirm against the hardware documentation.
 */
static void
alu_sub(void)
{
  alu_cf = alu_srca < alu_srcb ? 1 : 0;
  alu_accu = alu_srca - alu_srcb;
  alu_zf = !alu_accu;
}

/* Simulate the ALU AND op: ACCU = SRCA & SRCB; ZF reflects a zero result.
 * CF is left as it was.
 */
static void
alu_and(void)
{
  const uint64_t result = alu_srca & alu_srcb;

  alu_accu = result;
  alu_zf = (result == 0);
}

/* Simulate the ALU OR op: ACCU = SRCA | SRCB; ZF reflects a zero result.
 * CF is left as it was.
 */
static void
alu_or(void)
{
  const uint64_t result = alu_srca | alu_srcb;

  alu_accu = result;
  alu_zf = (result == 0);
}

/* Simulate the ALU XOR op: ACCU = SRCA ^ SRCB; ZF reflects a zero result.
 * CF is left as it was.
 */
static void
alu_xor(void)
{
  const uint64_t result = alu_srca ^ alu_srcb;

  alu_accu = result;
  alu_zf = (result == 0);
}

/* Return a printable name for an ALU operand encoding.
 *
 * Values 0..15 name the general purpose registers; SRCA, SRCB, ACCU, ZF and
 * CF are recognised by their MI_MATH_OPERAND_* encodings. Anything else
 * (including negative values, which must not be used as a table index) is
 * reported on stderr and trips an assertion; if assertions are compiled out a
 * placeholder string is returned so callers can still pass the result
 * straight to printf("%s").
 */
static const char *
operand_name(int operand)
{
  static const char *gpr_names[] = {
      "R0",
      "R1",
      "R2",
      "R3",
      "R4",
      "R5",
      "R6",
      "R7",
      "R8",
      "R9",
      "R10",
      "R11",
      "R12",
      "R13",
      "R14",
      "R15",
  };

  if (operand >= 0 && operand < 16) {
      return gpr_names[operand];
  }
  switch (operand) {
    case MI_MATH_OPERAND_SRCA:
      return "SRCA";
    case MI_MATH_OPERAND_SRCB:
      return "SRCB";
    case MI_MATH_OPERAND_ACCU:
      return "ACCU";
    case MI_MATH_OPERAND_ZF:
      return "ZF";
    case MI_MATH_OPERAND_CF:
      return "CF";
    default:
      fprintf(stderr, "Unknown operand\n");
      assert(0);
      return "<unknown>";
  }
}

/* Return the current simulated value of an ALU operand.
 *
 * Values 0..15 read the general purpose registers; SRCA, SRCB, ACCU, ZF and
 * CF read the corresponding ALU state. Anything else (including negative
 * values, which must not be used as an array index) is reported on stderr,
 * trips an assertion and yields 0.
 */
static uint64_t
operand_val(int operand)
{
  if (operand >= 0 && operand < 16) {
      return alu_gprs[operand];
  }
  switch (operand) {
    case MI_MATH_OPERAND_SRCA:
      return alu_srca;
    case MI_MATH_OPERAND_SRCB:
      return alu_srcb;
    case MI_MATH_OPERAND_ACCU:
      return alu_accu;
    case MI_MATH_OPERAND_ZF:
      return alu_zf;
    case MI_MATH_OPERAND_CF:
      return alu_cf;
    default:
      fprintf(stderr, "Unknown operand\n");
      assert(0);
      return 0;
  }
}

/* Simulate a 32bit register write to one half of a general purpose register.
 *
 * @reg must be dword aligned. Offsets 0x2600 .. 0x2600 + 8 * 16 address the
 * sixteen 64bit GPRs: the first dword of each pair is the lower 32 bits and
 * the dword at +4 is the upper 32 bits. The half is updated with shifts and
 * masks on the 64bit value (rather than through a casted uint32_t pointer) so
 * the result does not depend on host byte order or violate strict aliasing.
 *
 * Writes to any other register are reported on stderr and ignored.
 */
static void
write_reg(uint32_t reg, uint32_t val)
{
  assert((reg & 0x3) == 0);

  if (reg >= 0x2600 && reg < 0x2600 + 8 * 16) {
      uint32_t offset = reg - 0x2600;
      uint64_t *gpr = &alu_gprs[offset / 8];
      int bit_shift = (offset & 4) ? 32 : 0;

      *gpr = (*gpr & ~(0xffffffffULL << bit_shift)) |
             ((uint64_t)val << bit_shift);

      printf("%50s: %0x=%u\n", "", reg, val);
  } else
    fprintf(stderr, "Unknown register\n");
}

/* MI_LOAD_REGISTER_IMM payload parser state: index of the next payload dword
 * and the register offset most recently read from the stream.
 */
static int mi_lri_parser_offset;
static int mi_lri_parser_reg;

/* Consume one MI_LOAD_REGISTER_IMM payload dword. The payload alternates
 * between a register offset (even positions) and the value to write to that
 * register (odd positions).
 */
static void
mi_lri_parse(uint32_t word)
{
  const bool is_value = (mi_lri_parser_offset & 1) != 0;

  mi_lri_parser_offset++;

  if (!is_value) {
      mi_lri_parser_reg = word;
      return;
  }

  write_reg(mi_lri_parser_reg, word);
}

/* Simulate a 32bit register read from one half of a general purpose register.
 *
 * @reg must be dword aligned. Offsets 0x2600 .. 0x2600 + 8 * 16 address the
 * sixteen 64bit GPRs: the first dword of each pair is the lower 32 bits and
 * the dword at +4 is the upper 32 bits. The half is extracted with a shift on
 * the 64bit value (rather than through a casted uint32_t pointer) so the
 * result does not depend on host byte order or violate strict aliasing.
 *
 * Reads from any other register are reported on stderr and return 0.
 */
static uint32_t
load_reg(uint32_t reg)
{
  assert((reg & 0x3) == 0);

  if (reg >= 0x2600 && reg < 0x2600 + 8 * 16) {
      uint32_t offset = reg - 0x2600;
      uint64_t gpr = alu_gprs[offset / 8];

      return (offset & 4) ? (uint32_t)(gpr >> 32) : (uint32_t)gpr;
  } else {
      fprintf(stderr, "Unknown register\n");
      return 0;
  }
}

/* MI_LOAD_REGISTER_REG payload parser state: index of the next payload dword
 * and the source register offset most recently read from the stream.
 */
static int mi_lrr_parser_offset;
static int mi_lrr_parser_src_reg;

/* Consume one MI_LOAD_REGISTER_REG payload dword. The payload alternates
 * between a source register offset (even positions) and a destination
 * register offset (odd positions); the copy is performed once the
 * destination has been seen.
 */
static void
mi_lrr_parse(uint32_t word)
{
  const bool is_dest = (mi_lrr_parser_offset & 1) != 0;

  mi_lrr_parser_offset++;

  if (!is_dest) {
      mi_lrr_parser_src_reg = word;
      return;
  }

  uint32_t copied = load_reg(mi_lrr_parser_src_reg);
  write_reg(word, copied);
}

static void
mi_math_parse(uint32_t word)
{
  uint32_t opcode;
  uint32_t operand0, operand1;

  /* ALU instruction = 20bit opcode : 10bit operand0 : 10bit operand1 */
  opcode = (word & (0xfff << 20)) >> 20;
  operand0 = (word & (0x3ff << 10)) >> 10;
  operand1 = word & 0x3ff;

  switch (opcode) {
    case MI_MATH_OPCODE_LOAD:
      printf("  LOAD %s=%"PRIu64" %s=%"PRIu64"\n",
             operand_name(operand0),
             operand_val(operand0),
             operand_name(operand1),
             operand_val(operand1));
      alu_load(operand0, operand1);
      printf("%50s: %s=%"PRIu64"\n", "",
             operand_name(operand0),
             operand_val(operand0));
      break;
    case MI_MATH_OPCODE_LOADINV:
      printf("  LOADINV %s=%"PRIu64" %s=%"PRIu64"\n",
             operand_name(operand0),
             operand_val(operand0),
             operand_name(operand1),
             operand_val(operand1));
      alu_loadinv(operand0, operand1);
      printf("%50s: %s=%"PRIu64"\n", "",
             operand_name(operand0),
             operand_val(operand0));
      break;
    case MI_MATH_OPCODE_LOAD0:
      printf("  LOAD0 %s=%"PRIu64"\n",
             operand_name(operand0),
             operand_val(operand0));
      alu_load0(operand0);
      printf("%50s: %s=%"PRIu64"\n", "",
             operand_name(operand0),
             operand_val(operand0));
      break;
    case MI_MATH_OPCODE_LOAD1:
      printf("  LOAD1 %s=%"PRIu64"\n",
             operand_name(operand0),
             operand_val(operand0));
      alu_load1(operand0);
      printf("%50s: %s=%"PRIu64"\n", "",
             operand_name(operand0),
             operand_val(operand0));
      break;
    case MI_MATH_OPCODE_STORE:
      printf("  STORE %s=%"PRIu64" %s=%"PRIu64"\n",
             operand_name(operand0),
             operand_val(operand0),
             operand_name(operand1),
             operand_val(operand1));
      alu_store(operand0, operand1);
      printf("%50s: %s=%"PRIu64"\n", "",
             operand_name(operand0),
             operand_val(operand0));
      break;
    case MI_MATH_OPCODE_ADD:
      printf("  ADD (A=%"PRIu64", B=%"PRIu64")\n",
             operand_val(MI_MATH_OPERAND_SRCA),
             operand_val(MI_MATH_OPERAND_SRCB));
      alu_add();
      break;
    case MI_MATH_OPCODE_SUB:
      printf("  SUB (A=%"PRIu64", B=%"PRIu64")\n",
             operand_val(MI_MATH_OPERAND_SRCA),
             operand_val(MI_MATH_OPERAND_SRCB));
      alu_sub();
      break;
    case MI_MATH_OPCODE_AND:
      printf("  AND (A=%"PRIu64", B=%"PRIx64")\n",
             operand_val(MI_MATH_OPERAND_SRCA),
             operand_val(MI_MATH_OPERAND_SRCB));
      alu_and();
      break;
    case MI_MATH_OPCODE_OR:
      printf("  OR (A=%"PRIu64", B=%"PRIu64")\n",
             operand_val(MI_MATH_OPERAND_SRCA),
             operand_val(MI_MATH_OPERAND_SRCB));
      alu_or();
      break;
    case MI_MATH_OPCODE_XOR:
      printf("  XOR (A=%"PRIu64", B=%"PRIx64")\n",
             operand_val(MI_MATH_OPERAND_SRCA),
             operand_val(MI_MATH_OPERAND_SRCB));
      alu_xor();
      break;
  }

  printf("%50s: ACCU=%"PRIu64", CF=%d, ZF=%d\n", "", alu_accu, !!alu_cf, !!alu_zf);
}

static void (*sub_cmd_parser)(uint32_t word);

static void
rcs_parse(uint32_t word)
{
  int engine = word & (0x3 << 29);
  /* number of dwords left before we should pop to prev parser */
  static int cmd_remainder = 0;

  if (!sub_cmd_parser) {
    if (engine == CMD_MI) {

#define MI_OPCODE(X) ((X & (0x3f << 23)) >> 23)

        switch (MI_OPCODE(word)) {
          case MI_OPCODE(MI_LOAD_REGISTER_IMM):
            mi_lri_parser_offset = 0;
            cmd_remainder = (word & 0x7f) + 2;
            printf("MI_LRI(len = %d)\n", cmd_remainder);
            sub_cmd_parser = mi_lri_parse;
            break;
          case MI_OPCODE(MI_LOAD_REGISTER_REG):
            mi_lrr_parser_offset = 0;
            cmd_remainder = (word & 0x7f) + 2;
            printf("MI_LRR(len = %d)\n", cmd_remainder);
            sub_cmd_parser = mi_lrr_parse;
            break;
          case MI_OPCODE(HSW_MI_MATH):
            cmd_remainder = (word & 0x7f) + 2;
            printf("MI_MATH(len = %d)\n", cmd_remainder);
            sub_cmd_parser = mi_math_parse;
            break;
          default:
            fprintf(stderr, "Unhandled MI command\n");
        }

#undef MI_OPCODE
    } else {
        fprintf(stderr, "Only handling MI commands\n");
    }

    cmd_remainder--;
  } else {
    sub_cmd_parser(word);
    if (--cmd_remainder == 0)
      sub_cmd_parser = NULL;
  }
}

/* Emit an MI_LOAD_REGISTER_IMM that loads the 64bit immediate @imm into the
 * register pair at @reg: the lower 32 bits go to @reg and the upper 32 bits
 * to @reg + 4. Five dwords are appended to the batch.
 */
static void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   const uint32_t header = MI_LOAD_REGISTER_IMM | (5 - 2);
   const uint32_t lo_reg = reg;
   const uint32_t hi_reg = reg + 4;
   const uint32_t lo_val = imm & 0xffffffff;
   const uint32_t hi_val = imm >> 32;

   BEGIN_BATCH(5);
   OUT_BATCH(header);
   OUT_BATCH(lo_reg);
   OUT_BATCH(lo_val);
   OUT_BATCH(hi_reg);
   OUT_BATCH(hi_val);
   ADVANCE_BATCH();
}

/* Emit an MI_LOAD_REGISTER_IMM that loads the 32bit immediate @imm into the
 * register at @reg. Three dwords are appended to the batch.
 */
static void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   const uint32_t header = MI_LOAD_REGISTER_IMM | (3 - 2);

   BEGIN_BATCH(3);
   OUT_BATCH(header);
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/* Emit an MI_LOAD_REGISTER_REG that copies the 32bit register at @src into
 * the register at @dest. Note the stream order is source first, then
 * destination. Three dwords are appended to the batch.
 */
static void
brw_load_register_reg(struct brw_context *brw, uint32_t dest, uint32_t src)
{
   const uint32_t header = MI_LOAD_REGISTER_REG | (3 - 2);

   BEGIN_BATCH(3);
   OUT_BATCH(header);
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}

static void
alu_logic_op_gpr_u64_with_tmp(struct brw_context *brw,
                              uint32_t op, /* AND, OR, XOR */
                              int operand0_gpr, /* 0-15 */
                              uint64_t operand1_imm,
                              int tmp_gpr) /* 0-15 */
{
   uint32_t maths[] = {
      MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, operand0_gpr),
      MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, tmp_gpr),
      op << 20,
      MI_MATH_ALU0(AND),
      MI_MATH_ALU2(STORE, operand0_gpr, MI_MATH_OPERAND_ACCU),
   };

   assert(operand0_gpr != tmp_gpr);
   brw_load_register_imm64(brw, HSW_CS_GPR(tmp_gpr), operand1_imm);

   BEGIN_BATCH(1 + ARRAY_SIZE(maths));
   OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));

   for (int m = 0; m < ARRAY_SIZE(maths); m++)
      OUT_BATCH(maths[m]);

   ADVANCE_BATCH();
}

/* left shift any GPR using one other GPR as a temporary */
static void
alu_lshift_gpr_with_tmp(struct brw_context *brw,
                        int gpr,
                        int shift,
                        int gpr_tmp)
{
  uint32_t left_shift[] = {
      MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, gpr),
      MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, gpr),
      MI_MATH_ALU0(ADD),
      MI_MATH_ALU2(STORE, gpr, MI_MATH_OPERAND_ACCU),
  };
  uint64_t top_mask = (1ULL << (64 - shift)) - 1;
  int max_math_ops = brw->gen >= 9 ? 128 : 32;

  /* XXX: assuming array size is a factor of max ops... */
  int max_shifts = max_math_ops / ARRAY_SIZE(left_shift);

  int n_cmds = (shift + max_shifts - 1) / max_shifts;
  int batch_len = n_cmds * 4 + shift * ARRAY_SIZE(left_shift);

  assert(shift > 0);
  assert(shift < 64);

  /* Copying hsw_queryobj.c idea here and masking out the top
   * bits to avoid overflow.
   *
   * XXX: any reason to really worry about setting CF?
   * TODO: double check details of how ALU handles overflow
   */
  alu_logic_op_gpr_u64_with_tmp(brw,
                                MI_MATH_OPCODE_AND,
                                gpr,
                                top_mask,
                                gpr_tmp);

  BEGIN_BATCH(batch_len);

  while (shift) {
      int n_shifts = min(max_shifts, shift);
      int cmd_len = ARRAY_SIZE(left_shift) * n_shifts + 1;

      OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));

      for (int i = 0; i < n_shifts; i++) {
         for (int m = 0; m < ARRAY_SIZE(left_shift); m++)
            OUT_BATCH(left_shift[m]);
      }

      shift -= n_shifts;
  }

  ADVANCE_BATCH();
}

/* Since the upper and lower 32bit words of the ALU's general purpose registers
 * are addressable via mmio we can implement a right shift in terms of shifting
 * left by (32-n) bits and then keep the upper 32bits.
 *
 * This is limited to shifts < 32 bits
 *
 * In the worst case a rshift by one bit will result in 248 math ops plus
 * headers, three LRRs and an LRI. That sounds like a lot, but still
 * I *guess* the ALU operations being so simple only take one or two
 * clocks each.
 *
 * Totally guessing at the ALU taking maybe 2 cycles per ADD with the
 * GPU running at 500MHz that would be less than a microsecond, so
 * that could still be fine if the work is separate from the
 * application capturing metrics, only done when storing the results.
 *
 * XXX: There are probably common cases where it can be assumed that
 * upper 32bits of the result should be zero and the number of
 * instructions could be more than halved.
 */
/* Right shift @gpr in place by @shift bits (0 < @shift < 32), clobbering
 * @gpr_tmp0 and @gpr_tmp1.
 *
 * Both halves are shifted left by (32 - shift) and the upper dword of each
 * shifted value is kept: the upper dword of the shifted original becomes the
 * new lower dword, and the upper dword of the shifted copy of the original
 * high half becomes the new upper dword.
 */
static void
alu_rshift_gpr_32_with_2_tmp(struct brw_context *brw,
                             int gpr,
                             int shift,
                             int gpr_tmp0,
                             int gpr_tmp1)
{
  assert(shift > 0);
  assert(shift < 32);

  const int up_shift = 32 - shift;
  const uint32_t value_lo = HSW_CS_GPR(gpr);
  const uint32_t value_hi = HSW_CS_GPR(gpr) + 4;
  const uint32_t high_copy_lo = HSW_CS_GPR(gpr_tmp1);
  const uint32_t high_copy_hi = HSW_CS_GPR(gpr_tmp1) + 4;

  /* Take a zero extended copy of the upper 32bits, to be shifted separately */
  brw_load_register_reg(brw, high_copy_lo, value_hi);
  brw_load_register_imm32(brw, high_copy_hi, 0);

  /* Lower dword of the result: shift the value and keep its upper dword */
  alu_lshift_gpr_with_tmp(brw, gpr, up_shift, gpr_tmp0);
  brw_load_register_reg(brw, value_lo, value_hi);

  /* Upper dword of the result: same again with the copied high half */
  alu_lshift_gpr_with_tmp(brw, gpr_tmp1, up_shift, gpr_tmp0);
  brw_load_register_reg(brw, value_hi, high_copy_hi);
}

/* Nabbed from src/util/bitscan.h
 *
 * Returns the one-based position of the most significant set bit of @u, or
 * zero when no bit is set (so 1 -> 1, 0x80 -> 8, 1ULL << 63 -> 64).
 */
static inline unsigned
util_last_bit64(uint64_t u)
{
#if defined(HAVE___BUILTIN_CLZLL)
   if (u == 0)
      return 0;
   return 64 - __builtin_clzll(u);
#elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM || _M_IA64)
   unsigned long index;
   return _BitScanReverse64(&index, u) ? index + 1 : 0;
#else
   /* Portable fallback: count how many right shifts empty the value */
   unsigned n_bits;

   for (n_bits = 0; u != 0; u >>= 1)
      n_bits++;

   return n_bits;
#endif
}

/* Approximate an arbitrary multiplication by separating into a
 * converging sum of power-of-two multiplications.
 *
 * Note: this potentially uses all 16 of the ALU registers, depending
 * on the number of bits set in @factor.
 *
 * Note: We don't limit the factor to 16bits since we want to support
 * 48:16 fixed point factors when we need to multiply by a fraction,
 * but in general we only have 16bit of precision for the factor.
 */
static void
alu_mul_gpr0_u64(struct brw_context *brw, uint64_t factor)
{
  int max_math_ops = brw->gen >= 9 ? 128 : 32;
  /* A single lshift is 4 ops: 2 LOADs into A/B, ADD, STORE back */
  int ops_per_lshift = 4;
  int max_shifts = max_math_ops / ops_per_lshift;

  /* We have up 16 GPRs we can use to save intermediate POT multiplications */
#define MAX_STEPS 16

  int pot_shifts[MAX_STEPS + 1]; /* room to zero terminate */
  int n_steps;

  if (factor == 0) {
      BEGIN_BATCH(3 * 4);
      OUT_BATCH(HSW_MI_MATH | (3 - 2));
      OUT_BATCH(MI_MATH_ALU1(LOAD0, MI_MATH_OPERAND_SRCA));
      OUT_BATCH(MI_MATH_ALU2(STORE, R0, MI_MATH_OPERAND_SRCA));
      ADVANCE_BATCH();
      return;
  }

  /* Determine how many separate power of two multiplications to
   * decompose this in to.
   *
   * We scan from most significant bits to the least. If the factor
   * has > MAX_STEPS bits set then the least significant bits will be
   * ignored, making the result less precise.
   */
  for (n_steps = 0; n_steps < MAX_STEPS && factor; n_steps++) {
      pot_shifts[n_steps] = util_last_bit64(factor) - 1;
      factor -= (1 << pot_shifts[n_steps]);
  }

  /* zero terminate for calculating deltas below */
  pot_shifts[n_steps] = 0;

  /* Example states:
   * factor = 81, pot_shifts[0]=6, pot_shifts[1]=4, pot_shifts[2]=0, n_steps = 3
   * factor = 80, pot_shifts[0]=6, pot_shifts[1]=4, n_steps = 2
   * factor = 3,  pot_shifts[0]=1, pot_shifts[1]=0, n_steps = 2
   * factor = 2,  pot_shifts[0]=1, n_steps = 1
   * factor = 1,  pot_shifts[0]=0, n_steps = 1
   */

  /* Starting with the least significant POT factor we shift the
   * original value starting in R0 towards the most significant POT
   * factor.
   *
   * For each step we progress which general purpose register we
   * save the result into, with the first step always loading and
   * saving into R0, then the next step loads from R0 saving to
   * R1, for a total of up to 16 intermediate POT multiplications.
   *
   * Note: the first step might be a factor of one (shift 0), which is
   * a NOOP
   */
  for (int i = pot_shifts[n_steps - 1] ? 0 : 1; i < n_steps; i++) {
      int step_load_gpr = i ? i - 1: i; /* load + store to R0 for first step */
      int step_store_gpr = i;

      /* How far to shift before reaching the next step? */
      int shift = pot_shifts[n_steps - i - 1] - pot_shifts[n_steps - i];

      /* Note: careful to round up here... */
      int n_cmds = (shift + max_shifts - 1) / max_shifts;
      int batch_len = n_cmds * 4 + shift * ops_per_lshift;
      bool first_step_shift = true;

      BEGIN_BATCH(batch_len);

      while (shift) {
          int n_packed_shifts = min(max_shifts, shift);
          int cmd_len = ops_per_lshift * n_packed_shifts + 1;

          OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));

          for (int i = 0; i < n_packed_shifts; i++) {
              int load_gpr = first_step_shift ? step_load_gpr : step_store_gpr;

              OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, load_gpr));
              OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, load_gpr));
              OUT_BATCH(MI_MATH_ALU0(ADD));
              OUT_BATCH(MI_MATH_ALU2(STORE, step_store_gpr,
                                     MI_MATH_OPERAND_ACCU));

              first_step_shift = false;
          }

          shift -= n_packed_shifts;
      }

      ADVANCE_BATCH();
  }

  /* If the multiplication was split up then sum intermediate values... */
  if (n_steps > 1) {
      int ops_per_add = 4; /* 2 LOADS in to A/B, ADD, STORE back */
      int max_adds = max_math_ops / ops_per_add;
      int adds = n_steps - 1;

      /* Note: careful to round up here... */
      int n_cmds = (adds + max_adds - 1) / max_adds;
      int batch_len = n_cmds * 4 + adds * ops_per_add;

      int saved = R0 + 1;

      BEGIN_BATCH(batch_len);

      while (adds) {
          int n_packed_adds = min(max_adds, adds);
          int cmd_len = ops_per_add * n_packed_adds + 1;

          OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));

          for (int i = 0; i < n_packed_adds; i++) {
              OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, R0));
              OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, saved));
              OUT_BATCH(MI_MATH_ALU0(ADD));
              OUT_BATCH(MI_MATH_ALU2(STORE, R0, MI_MATH_OPERAND_ACCU));
              saved++;
          }

          adds -= n_packed_adds;
      }

      ADVANCE_BATCH();
  }
}

/* Emit the commands that multiply R0 in place by a non-negative floating
 * point @factor.
 *
 * Whole-number factors are handed directly to the integer multiply.
 * Fractional factors are scaled by 2^16 to give a fixed point factor, and
 * the 16 fraction bits are dropped again afterwards with a right shift that
 * uses R1 and R2 as temporaries.
 */
static void
alu_mul_gpr0_float(struct brw_context *brw, float factor)
{
  assert(factor >= 0);

  if (floorf(factor) == factor) {
      alu_mul_gpr0_u64(brw, factor);
      return;
  }

  /* Scaling by a further 2^16 effectively results in a 48:16 fixed point
   * value...
   */
  alu_mul_gpr0_u64(brw, factor * 65536);

  /* ...so now we just need to drop the fixed point fraction */
  alu_rshift_gpr_32_with_2_tmp(brw, R0, 16, R0 + 1, R0 + 2);
}

/* Test driver: seeds R0, emits the batch for one multiplication and then
 * replays that batch through the command parser, printing R0 before and
 * after.
 */
int
main(int argc, char **argv)
{
  int next_dword = 0;

  /* load 'timestamp' */
  alu_gprs[0] = 123;

  /* Other factors worth trying here: 64, 80, 83 */
  alu_mul_gpr0_float(&_brw, 83.33);

  printf("Batch length = %d words\n", batch_ptr);
  printf("Initial GPR0 = %"PRIu64"\n", alu_gprs[0]);

  while (next_dword < batch_ptr)
    rcs_parse(batch[next_dword++]);

  printf("Final GPR0 = %"PRIu64"\n", alu_gprs[0]);

  return 0;
}