Advertisement
Guest User

General Multiplication with Gen RCS ALU

a guest
Dec 14th, 2016
108
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 24.81 KB | None | 0 0
  1. /*
  2.  * Copyright (c) 2016 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  *
  23.  *
  24.  * A stand alone test bed for abusing the Gen graphics command
  25.  * streamer ALU
  26.  */
  27.  
  28. #include <stdint.h>
  29. #include <stdbool.h>
  30. #include <stdio.h>
  31. #include <assert.h>
  32. #include <math.h>
  33. #include <inttypes.h>
  34.  
  /* Generic helpers. Both macros may evaluate an argument more than once, so
   * do not pass expressions with side effects. */
  #define max(A, B) ( (A)>(B) ? (A) : (B) )
  #define min(A, B) ( (A)<(B) ? (A) : (B) )

  /* Element count of a true array (not a pointer). The argument is
   * parenthesised so expression arguments index correctly. */
  #define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))

  /* Minimal stand-ins for batch buffer emission macros: BEGIN_BATCH only
   * evaluates its length argument, ADVANCE_BATCH expands to nothing and
   * OUT_BATCH appends one dword to the in-scope 'batch' array at 'batch_ptr'.
   *
   * OUT_BATCH asserts that there is room left so that an over-long command
   * stream is caught instead of silently writing past the end of the array.
   */
  #define BEGIN_BATCH(X) do { (void)(X); } while(0)
  #define ADVANCE_BATCH(X)
  #define OUT_BATCH(X) do { \
        assert(batch_ptr >= 0 && (size_t)batch_ptr < ARRAY_SIZE(batch)); \
        batch[batch_ptr++] = (X); \
     } while(0)
  43.  
  /* Command type field value for MI commands */
  #define CMD_MI              (0x0 << 29)

  /* MI command headers (opcode is placed at bit 23) */
  #define MI_LOAD_REGISTER_IMM        (CMD_MI | (0x22 << 23))
  #define MI_LOAD_REGISTER_REG        (CMD_MI | (0x2A << 23))

  #define HSW_MI_MATH         (CMD_MI | (0x1a << 23))

  /* ALU instruction opcodes, as packed into an MI_MATH payload dword */
  #define MI_MATH_OPCODE_NOOP      0x000
  #define MI_MATH_OPCODE_LOAD      0x080
  #define MI_MATH_OPCODE_LOADINV   0x480
  #define MI_MATH_OPCODE_LOAD0     0x081
  #define MI_MATH_OPCODE_LOAD1     0x481
  #define MI_MATH_OPCODE_ADD       0x100
  #define MI_MATH_OPCODE_SUB       0x101
  #define MI_MATH_OPCODE_AND       0x102
  #define MI_MATH_OPCODE_OR        0x103
  #define MI_MATH_OPCODE_XOR       0x104
  #define MI_MATH_OPCODE_STORE     0x180
  #define MI_MATH_OPCODE_STOREINV  0x580

  /* ALU operand encodings */
  #define MI_MATH_OPERAND_R0   0x00
  #define MI_MATH_OPERAND_R1   0x01
  #define MI_MATH_OPERAND_R2   0x02
  #define MI_MATH_OPERAND_R3   0x03
  #define MI_MATH_OPERAND_R4   0x04
  #define MI_MATH_OPERAND_SRCA 0x20
  #define MI_MATH_OPERAND_SRCB 0x21
  #define MI_MATH_OPERAND_ACCU 0x31
  #define MI_MATH_OPERAND_ZF   0x32
  #define MI_MATH_OPERAND_CF   0x33

  /* Build one ALU instruction dword: opcode at bit 20, first operand at
   * bit 10, second operand at bit 0. 'opcode' is the suffix of one of the
   * MI_MATH_OPCODE_* names. Operands are parenthesised so that expression
   * arguments (e.g. "a | b") are encoded as a whole.
   */
  #define MI_MATH_ALU2(opcode, operand1, operand2) \
     ( ((MI_MATH_OPCODE_##opcode) << 20) | ((operand1) << 10) | (operand2) )

  #define MI_MATH_ALU1(opcode, operand1) \
     ( ((MI_MATH_OPCODE_##opcode) << 20) | ((operand1) << 10) )

  #define MI_MATH_ALU0(opcode) \
     ( ((MI_MATH_OPCODE_##opcode) << 20) )

  /* MMIO offset of the low dword of general purpose register n; the high
   * dword is at this offset + 4. */
  #define HSW_CS_GPR(n) (0x2600 + (n) * 8)

  /* Just to improve readability a bit */
  #define R0 0
  88.  
  /* Recorded command stream: OUT_BATCH() appends dwords to batch[] and
   * advances batch_ptr, which therefore is also the number of dwords
   * written so far. */
  static uint32_t batch[1024];
  static int batch_ptr = 0;

  /* Cut-down context holding only the hardware generation, which the
   * emitters use to choose the per-command MI_MATH instruction limit. */
  struct brw_context {
      int gen;
  };

  static struct brw_context _brw = { .gen = 9 };
  97.  
  /*
   * Emulated ALU state.
   *
   *   - SRCA / SRCB: the two u64 source registers
   *   - R0..R15:     sixteen general purpose u64 registers
   *   - ACCU:        u64 accumulator
   *   - ZF / CF:     zero and carry flags
   *
   * Supported operations: load, store, addition, subtraction, AND, OR, XOR
   */
  static uint64_t alu_gprs[16];

  static uint64_t alu_srca = 0;
  static uint64_t alu_srcb = 0;
  static uint64_t alu_accu = 0;

  static uint64_t alu_zf = 0;
  static uint64_t alu_cf = 0;
  113.  
  114.  
  115. static void
  116. alu_load(int operand0_dst, int operand1_gpr)
  117. {
  118.   if (operand1_gpr < 0 || operand1_gpr >= 16) {
  119.       fprintf(stderr, "invalid ALU GPR\n");
  120.       return;
  121.   }
  122.  
  123.   if (operand0_dst == MI_MATH_OPERAND_SRCA)
  124.     alu_srca = alu_gprs[operand1_gpr];
  125.   else if (operand0_dst == MI_MATH_OPERAND_SRCB)
  126.     alu_srcb = alu_gprs[operand1_gpr];
  127.   else
  128.     fprintf(stderr, "invalid load operand0 destination (should be SRCA or SRCB)\n");
  129. }
  130.  
  131. static void
  132. alu_loadinv(int operand0_dst, int operand1_gpr)
  133. {
  134.   if (operand1_gpr < 0 || operand1_gpr >= 16) {
  135.       fprintf(stderr, "invalid ALU GPR\n");
  136.       return;
  137.   }
  138.  
  139.   if (operand0_dst == MI_MATH_OPERAND_SRCA)
  140.     alu_srca = ~alu_gprs[operand1_gpr];
  141.   else if (operand0_dst == MI_MATH_OPERAND_SRCB)
  142.     alu_srcb = ~alu_gprs[operand1_gpr];
  143.   else
  144.     fprintf(stderr, "invalid load operand0 destination (should be SRCA or SRCB)\n");
  145. }
  146.  
  147. static void
  148. alu_load0(int operand0_dst)
  149. {
  150.   if (operand0_dst == MI_MATH_OPERAND_SRCA)
  151.     alu_srca = 0;
  152.   else if (operand0_dst == MI_MATH_OPERAND_SRCB)
  153.     alu_srcb = 0;
  154.   else
  155.     fprintf(stderr, "invalid load0 operand0 destination (should be SRCA or SRCB)\n");
  156. }
  157.  
  158. static void
  159. alu_load1(int operand0_dst)
  160. {
  161.   if (operand0_dst == MI_MATH_OPERAND_SRCA)
  162.     alu_srca = 1;
  163.   else if (operand0_dst == MI_MATH_OPERAND_SRCB)
  164.     alu_srcb = 1;
  165.   else
  166.     fprintf(stderr, "invalid load1 operand0 destination (should be SRCA or SRCB)\n");
  167. }
  168.  
  169. static void
  170. alu_store(int operand0_gpr_dst,
  171.           int operand1_src)
  172. {
  173.   if (operand0_gpr_dst < 0 || operand0_gpr_dst >= 16) {
  174.       fprintf(stderr, "invalid ALU GPR\n");
  175.       return;
  176.   }
  177.  
  178.   switch (operand1_src) {
  179.     case MI_MATH_OPERAND_ACCU:
  180.       alu_gprs[operand0_gpr_dst] = alu_accu;
  181.       break;
  182.     case MI_MATH_OPERAND_ZF:
  183.       alu_gprs[operand0_gpr_dst] = alu_zf ? ~0ULL : 0ULL;
  184.       break;
  185.     case MI_MATH_OPERAND_CF:
  186.       alu_gprs[operand0_gpr_dst] = alu_cf ? ~0ULL : 0ULL;
  187.       break;
  188.     default:
  189.       fprintf(stderr, "invalid store operand (should be ACCU, ZF or CF)\n");
  190.   }
  191. }
  192.  
  193. /* Implement as cascaded 32bit adds so we can easily handle the add with carry */
  194. static void
  195. alu_add(void)
  196. {
  197.   uint64_t lower = (alu_srca & 0xffffffff) + (alu_srcb & 0xffffffff);
  198.   uint64_t carry = lower > UINT32_MAX ? 1: 0;
  199.   uint64_t upper = (alu_srca >> 32) +
  200.                    (alu_srcb >> 32) +
  201.                    carry;
  202.  
  203.   alu_cf = upper > UINT32_MAX ? 1: 0;
  204.   alu_accu = (upper << 32) | lower;
  205.   alu_zf = !alu_accu;
  206. }
  207.  
  208. /* FIXME: handle carry */
  209. static void
  210. alu_sub(void)
  211. {
  212.   alu_cf = 0;
  213.   alu_accu = alu_srca - alu_srcb;
  214.   alu_zf = !alu_accu;
  215. }
  216.  
  217. static void
  218. alu_and(void)
  219. {
  220.   alu_accu = alu_srca & alu_srcb;
  221.   alu_zf = !alu_accu;
  222. }
  223.  
  224. static void
  225. alu_or(void)
  226. {
  227.   alu_accu = alu_srca | alu_srcb;
  228.   alu_zf = !alu_accu;
  229. }
  230.  
  231. static void
  232. alu_xor(void)
  233. {
  234.   alu_accu = alu_srca ^ alu_srcb;
  235.   alu_zf = !alu_accu;
  236. }
  237.  
  238. static const char *
  239. operand_name(int operand)
  240. {
  241.   static const char *gpr_names[] = {
  242.       "R0",
  243.       "R1",
  244.       "R2",
  245.       "R3",
  246.       "R4",
  247.       "R5",
  248.       "R6",
  249.       "R7",
  250.       "R8",
  251.       "R9",
  252.       "R10",
  253.       "R11",
  254.       "R12",
  255.       "R13",
  256.       "R14",
  257.       "R15",
  258.   };
  259.  
  260.   if (operand < 16) {
  261.       return gpr_names[operand];
  262.   }
  263.   switch (operand) {
  264.     case MI_MATH_OPERAND_SRCA:
  265.       return "SRCA";
  266.     case MI_MATH_OPERAND_SRCB:
  267.       return "SRCB";
  268.     case MI_MATH_OPERAND_ACCU:
  269.       return "ACCU";
  270.     case MI_MATH_OPERAND_ZF:
  271.       return "ZF";
  272.     case MI_MATH_OPERAND_CF:
  273.       return "CF";
  274.     default:
  275.       fprintf(stderr, "Unknown operand\n");
  276.       assert(0);
  277.       return NULL;
  278.   }
  279. }
  280.  
  281. static uint64_t
  282. operand_val(int operand)
  283. {
  284.   if (operand < 16) {
  285.       return alu_gprs[operand];
  286.   }
  287.   switch (operand) {
  288.     case MI_MATH_OPERAND_SRCA:
  289.       return alu_srca;
  290.     case MI_MATH_OPERAND_SRCB:
  291.       return alu_srcb;
  292.     case MI_MATH_OPERAND_ACCU:
  293.       return alu_accu;
  294.     case MI_MATH_OPERAND_ZF:
  295.       return alu_zf;
  296.     case MI_MATH_OPERAND_CF:
  297.       return alu_cf;
  298.     default:
  299.       fprintf(stderr, "Unknown operand\n");
  300.       assert(0);
  301.       return 0;
  302.   }
  303. }
  304.  
  305. static void
  306. write_reg(uint32_t reg, uint32_t val)
  307. {
  308.   assert((reg & 0x3) == 0);
  309.  
  310.   if (reg >= 0x2600 && reg < 0x2600 + 8 * 16) {
  311.       int gpr = reg - 0x2600;
  312.  
  313.       gpr /= 4;
  314.       ((uint32_t *)alu_gprs)[gpr] = val;
  315.  
  316.       printf("%50s: %0x=%u\n", "", reg, val);
  317.   } else
  318.     fprintf(stderr, "Unknown register");
  319. }
  320.  
  /* MI_LOAD_REGISTER_IMM payload parser state: number of payload dwords seen
   * so far, and the register offset taken from the most recent even-indexed
   * dword. */
  static int mi_lri_parser_offset;
  static int mi_lri_parser_reg;

  /* Consume one MI_LOAD_REGISTER_IMM payload dword. The payload alternates
   * between a register offset and the value to write to that register. */
  static void
  mi_lri_parse(uint32_t word)
  {
    const bool is_value = (mi_lri_parser_offset & 1) != 0;

    mi_lri_parser_offset++;

    if (!is_value) {
        mi_lri_parser_reg = word;
        return;
    }

    write_reg(mi_lri_parser_reg, word);
  }
  332.  
  333. static uint32_t
  334. load_reg(uint32_t reg)
  335. {
  336.   assert((reg & 0x3) == 0);
  337.  
  338.   if (reg >= 0x2600 && reg < 0x2600 + 8 * 16) {
  339.       int gpr = reg - 0x2600;
  340.  
  341.       gpr /= 4;
  342.  
  343.       return ((uint32_t *)alu_gprs)[gpr];
  344.   } else {
  345.       fprintf(stderr, "Unknown register");
  346.       return 0;
  347.   }
  348. }
  349.  
  /* MI_LOAD_REGISTER_REG payload parser state: number of payload dwords seen
   * so far, and the source register offset taken from the most recent
   * even-indexed dword. */
  static int mi_lrr_parser_offset;
  static int mi_lrr_parser_src_reg;

  /* Consume one MI_LOAD_REGISTER_REG payload dword. The payload is a source
   * register offset followed by a destination register offset; the copy is
   * performed once the destination has been seen. */
  static void
  mi_lrr_parse(uint32_t word)
  {
    const bool is_dest = (mi_lrr_parser_offset & 1) != 0;

    mi_lrr_parser_offset++;

    if (!is_dest) {
        mi_lrr_parser_src_reg = word;
        return;
    }

    write_reg(word, load_reg(mi_lrr_parser_src_reg));
  }
  362.  
  363. static void
  364. mi_math_parse(uint32_t word)
  365. {
  366.   uint32_t opcode;
  367.   uint32_t operand0, operand1;
  368.  
  369.   /* ALU instruction = 20bit opcode : 10bit operand0 : 10bit operand1 */
  370.   opcode = (word & (0xfff << 20)) >> 20;
  371.   operand0 = (word & (0x3ff << 10)) >> 10;
  372.   operand1 = word & 0x3ff;
  373.  
  374.   switch (opcode) {
  375.     case MI_MATH_OPCODE_LOAD:
  376.       printf("  LOAD %s=%"PRIu64" %s=%"PRIu64"\n",
  377.              operand_name(operand0),
  378.              operand_val(operand0),
  379.              operand_name(operand1),
  380.              operand_val(operand1));
  381.       alu_load(operand0, operand1);
  382.       printf("%50s: %s=%"PRIu64"\n", "",
  383.              operand_name(operand0),
  384.              operand_val(operand0));
  385.       break;
  386.     case MI_MATH_OPCODE_LOADINV:
  387.       printf("  LOADINV %s=%"PRIu64" %s=%"PRIu64"\n",
  388.              operand_name(operand0),
  389.              operand_val(operand0),
  390.              operand_name(operand1),
  391.              operand_val(operand1));
  392.       alu_loadinv(operand0, operand1);
  393.       printf("%50s: %s=%"PRIu64"\n", "",
  394.              operand_name(operand0),
  395.              operand_val(operand0));
  396.       break;
  397.     case MI_MATH_OPCODE_LOAD0:
  398.       printf("  LOAD0 %s=%"PRIu64"\n",
  399.              operand_name(operand0),
  400.              operand_val(operand0));
  401.       alu_load0(operand0);
  402.       printf("%50s: %s=%"PRIu64"\n", "",
  403.              operand_name(operand0),
  404.              operand_val(operand0));
  405.       break;
  406.     case MI_MATH_OPCODE_LOAD1:
  407.       printf("  LOAD1 %s=%"PRIu64"\n",
  408.              operand_name(operand0),
  409.              operand_val(operand0));
  410.       alu_load1(operand0);
  411.       printf("%50s: %s=%"PRIu64"\n", "",
  412.              operand_name(operand0),
  413.              operand_val(operand0));
  414.       break;
  415.     case MI_MATH_OPCODE_STORE:
  416.       printf("  STORE %s=%"PRIu64" %s=%"PRIu64"\n",
  417.              operand_name(operand0),
  418.              operand_val(operand0),
  419.              operand_name(operand1),
  420.              operand_val(operand1));
  421.       alu_store(operand0, operand1);
  422.       printf("%50s: %s=%"PRIu64"\n", "",
  423.              operand_name(operand0),
  424.              operand_val(operand0));
  425.       break;
  426.     case MI_MATH_OPCODE_ADD:
  427.       printf("  ADD (A=%"PRIu64", B=%"PRIu64")\n",
  428.              operand_val(MI_MATH_OPERAND_SRCA),
  429.              operand_val(MI_MATH_OPERAND_SRCB));
  430.       alu_add();
  431.       break;
  432.     case MI_MATH_OPCODE_SUB:
  433.       printf("  SUB (A=%"PRIu64", B=%"PRIu64")\n",
  434.              operand_val(MI_MATH_OPERAND_SRCA),
  435.              operand_val(MI_MATH_OPERAND_SRCB));
  436.       alu_sub();
  437.       break;
  438.     case MI_MATH_OPCODE_AND:
  439.       printf("  AND (A=%"PRIu64", B=%"PRIx64")\n",
  440.              operand_val(MI_MATH_OPERAND_SRCA),
  441.              operand_val(MI_MATH_OPERAND_SRCB));
  442.       alu_and();
  443.       break;
  444.     case MI_MATH_OPCODE_OR:
  445.       printf("  OR (A=%"PRIu64", B=%"PRIu64")\n",
  446.              operand_val(MI_MATH_OPERAND_SRCA),
  447.              operand_val(MI_MATH_OPERAND_SRCB));
  448.       alu_or();
  449.       break;
  450.     case MI_MATH_OPCODE_XOR:
  451.       printf("  XOR (A=%"PRIu64", B=%"PRIx64")\n",
  452.              operand_val(MI_MATH_OPERAND_SRCA),
  453.              operand_val(MI_MATH_OPERAND_SRCB));
  454.       alu_xor();
  455.       break;
  456.   }
  457.  
  458.   printf("%50s: ACCU=%"PRIu64", CF=%d, ZF=%d\n", "", alu_accu, !!alu_cf, !!alu_zf);
  459. }
  460.  
  461. static void (*sub_cmd_parser)(uint32_t word);
  462.  
  463. static void
  464. rcs_parse(uint32_t word)
  465. {
  466.   int engine = word & (0x3 << 29);
  467.   /* number of dwords left before we should pop to prev parser */
  468.   static int cmd_remainder = 0;
  469.  
  470.   if (!sub_cmd_parser) {
  471.     if (engine == CMD_MI) {
  472.  
  473. #define MI_OPCODE(X) ((X & (0x3f << 23)) >> 23)
  474.  
  475.         switch (MI_OPCODE(word)) {
  476.           case MI_OPCODE(MI_LOAD_REGISTER_IMM):
  477.             mi_lri_parser_offset = 0;
  478.             cmd_remainder = (word & 0x7f) + 2;
  479.             printf("MI_LRI(len = %d)\n", cmd_remainder);
  480.             sub_cmd_parser = mi_lri_parse;
  481.             break;
  482.           case MI_OPCODE(MI_LOAD_REGISTER_REG):
  483.             mi_lrr_parser_offset = 0;
  484.             cmd_remainder = (word & 0x7f) + 2;
  485.             printf("MI_LRR(len = %d)\n", cmd_remainder);
  486.             sub_cmd_parser = mi_lrr_parse;
  487.             break;
  488.           case MI_OPCODE(HSW_MI_MATH):
  489.             cmd_remainder = (word & 0x7f) + 2;
  490.             printf("MI_MATH(len = %d)\n", cmd_remainder);
  491.             sub_cmd_parser = mi_math_parse;
  492.             break;
  493.           default:
  494.             fprintf(stderr, "Unhandled MI command\n");
  495.         }
  496.  
  497. #undef MI_OPCODE
  498.     } else {
  499.         fprintf(stderr, "Only handling MI commands\n");
  500.     }
  501.  
  502.     cmd_remainder--;
  503.   } else {
  504.     sub_cmd_parser(word);
  505.     if (--cmd_remainder == 0)
  506.       sub_cmd_parser = NULL;
  507.   }
  508. }
  509.  
  510. static void
  511. brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
  512. {
  513.    BEGIN_BATCH(5);
  514.    OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
  515.    OUT_BATCH(reg);
  516.    OUT_BATCH(imm & 0xffffffff);
  517.    OUT_BATCH(reg + 4);
  518.    OUT_BATCH(imm >> 32);
  519.    ADVANCE_BATCH();
  520. }
  521.  
  522. static void
  523. brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
  524. {
  525.    BEGIN_BATCH(3);
  526.    OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
  527.    OUT_BATCH(reg);
  528.    OUT_BATCH(imm);
  529.    ADVANCE_BATCH();
  530. }
  531.  
  532. static void
  533. brw_load_register_reg(struct brw_context *brw, uint32_t dest, uint32_t src)
  534. {
  535.    BEGIN_BATCH(3);
  536.    OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
  537.    OUT_BATCH(src);
  538.    OUT_BATCH(dest);
  539.    ADVANCE_BATCH();
  540. }
  541.  
  542. static void
  543. alu_logic_op_gpr_u64_with_tmp(struct brw_context *brw,
  544.                               uint32_t op, /* AND, OR, XOR */
  545.                               int operand0_gpr, /* 0-15 */
  546.                               uint64_t operand1_imm,
  547.                               int tmp_gpr) /* 0-15 */
  548. {
  549.    uint32_t maths[] = {
  550.       MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, operand0_gpr),
  551.       MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, tmp_gpr),
  552.       op << 20,
  553.       MI_MATH_ALU0(AND),
  554.       MI_MATH_ALU2(STORE, operand0_gpr, MI_MATH_OPERAND_ACCU),
  555.    };
  556.  
  557.    assert(operand0_gpr != tmp_gpr);
  558.    brw_load_register_imm64(brw, HSW_CS_GPR(tmp_gpr), operand1_imm);
  559.  
  560.    BEGIN_BATCH(1 + ARRAY_SIZE(maths));
  561.    OUT_BATCH(HSW_MI_MATH | (1 + ARRAY_SIZE(maths) - 2));
  562.  
  563.    for (int m = 0; m < ARRAY_SIZE(maths); m++)
  564.       OUT_BATCH(maths[m]);
  565.  
  566.    ADVANCE_BATCH();
  567. }
  568.  
  569. /* left shift any GPR using one other GPR as a temporary */
  570. static void
  571. alu_lshift_gpr_with_tmp(struct brw_context *brw,
  572.                         int gpr,
  573.                         int shift,
  574.                         int gpr_tmp)
  575. {
  576.   uint32_t left_shift[] = {
  577.       MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, gpr),
  578.       MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, gpr),
  579.       MI_MATH_ALU0(ADD),
  580.       MI_MATH_ALU2(STORE, gpr, MI_MATH_OPERAND_ACCU),
  581.   };
  582.   uint64_t top_mask = (1ULL << (64 - shift)) - 1;
  583.   int max_math_ops = brw->gen >= 9 ? 128 : 32;
  584.  
  585.   /* XXX: assuming array size is a factor of max ops... */
  586.   int max_shifts = max_math_ops / ARRAY_SIZE(left_shift);
  587.  
  588.   int n_cmds = (shift + max_shifts - 1) / max_shifts;
  589.   int batch_len = n_cmds * 4 + shift * ARRAY_SIZE(left_shift);
  590.  
  591.   assert(shift > 0);
  592.   assert(shift < 64);
  593.  
  594.   /* Copying hsw_queryobj.c idea here and masking out the top
  595.    * bits to avoid overflow.
  596.    *
  597.    * XXX: any reason to really worry about setting CF?
  598.    * TODO: double check details of how ALU handles overflow
  599.    */
  600.   alu_logic_op_gpr_u64_with_tmp(brw,
  601.                                 MI_MATH_OPCODE_AND,
  602.                                 gpr,
  603.                                 top_mask,
  604.                                 gpr_tmp);
  605.  
  606.   BEGIN_BATCH(batch_len);
  607.  
  608.   while (shift) {
  609.       int n_shifts = min(max_shifts, shift);
  610.       int cmd_len = ARRAY_SIZE(left_shift) * n_shifts + 1;
  611.  
  612.       OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
  613.  
  614.       for (int i = 0; i < n_shifts; i++) {
  615.          for (int m = 0; m < ARRAY_SIZE(left_shift); m++)
  616.             OUT_BATCH(left_shift[m]);
  617.       }
  618.  
  619.       shift -= n_shifts;
  620.   }
  621.  
  622.   ADVANCE_BATCH();
  623. }
  624.  
  /* Right shift a GPR by fewer than 32 bits.
   *
   * Because the upper and lower 32bit words of the ALU's general purpose
   * registers are individually addressable via mmio, a right shift by n can
   * be built from a left shift by (32 - n) followed by keeping only the
   * upper 32bits. This is done separately for the lower and upper dwords of
   * @gpr, which is why the shift is limited to less than 32 bits.
   *
   * @gpr_tmp0 is the scratch register used by the left shifts; @gpr_tmp1
   * holds the copy of the upper dword while it is shifted.
   *
   * In the worst case a rshift by one bit will result in 248 math ops plus
   * headers, three LRRs and an LRI. That sounds like a lot, but still
   * I *guess* the ALU operations being so simple only take one or two
   * clocks each.
   *
   * Totally guessing at the ALU taking maybe 2 cycles per ADD with the
   * GPU running at 500MHz that would be less than a microsecond, so
   * that could still be fine if the work is separate from the
   * application capturing metrics, only done when storing the results.
   *
   * XXX: There are probably common cases where it can be assumed that
   * upper 32bits of the result should be zero and the number of
   * instructions could be more than halved.
   */
  static void
  alu_rshift_gpr_32_with_2_tmp(struct brw_context *brw,
                               int gpr,
                               int shift,
                               int gpr_tmp0,
                               int gpr_tmp1)
  {
    const uint32_t val_lo = HSW_CS_GPR(gpr);
    const uint32_t val_hi = val_lo + 4;
    const uint32_t tmp_lo = HSW_CS_GPR(gpr_tmp1);
    const uint32_t tmp_hi = tmp_lo + 4;
    const int up_shift = 32 - shift;

    assert(shift > 0);
    assert(shift < 32);

    /* Stash the upper dword, zero extended, so it can be shifted on its own */
    brw_load_register_reg(brw, tmp_lo, val_hi);
    brw_load_register_imm32(brw, tmp_hi, 0);

    /* Lower dword of the result: shift up, then pull the upper half down */
    alu_lshift_gpr_with_tmp(brw, gpr, up_shift, gpr_tmp0);
    brw_load_register_reg(brw, val_lo, val_hi);

    /* Upper dword of the result comes from the stashed copy */
    alu_lshift_gpr_with_tmp(brw, gpr_tmp1, up_shift, gpr_tmp0);
    brw_load_register_reg(brw, val_hi, tmp_hi);
  }
  669.  
  /* Nabbed from src/util/bitscan.h
   *
   * Returns the 1-based position of the most significant set bit of @u, or
   * 0 when @u is zero (so util_last_bit64(1) == 1 and the top bit gives 64).
   */
  static inline unsigned
  util_last_bit64(uint64_t u)
  {
  #if defined(HAVE___BUILTIN_CLZLL)
     if (u == 0)
        return 0;
     return 64 - __builtin_clzll(u);
  #elif defined(_MSC_VER) && (_M_AMD64 || _M_ARM || _M_IA64)
     unsigned long index;
     if (!_BitScanReverse64(&index, u))
        return 0;
     return index + 1;
  #else
     /* Portable fallback: count how many right shifts empty the value */
     unsigned n_bits;

     for (n_bits = 0; u != 0; u >>= 1)
        n_bits++;

     return n_bits;
  #endif
  }
  691.  
  692. /* Approximate an arbitrary multiplication by separating into a
  693.  * converging sum of power-of-two multiplications.
  694.  *
  695.  * Note: this potentially uses all 16 of the ALU registers, depending
  696.  * on the number of bits set in @factor.
  697.  *
  698.  * Note: We don't limit the factor to 16bits since we want to support
  699.  * 48:16 fixed point factors when we need to multiply by a fraction,
  700.  * but in general we only have 16bit of precision for the factor.
  701.  */
  702. static void
  703. alu_mul_gpr0_u64(struct brw_context *brw, uint64_t factor)
  704. {
  705.   int max_math_ops = brw->gen >= 9 ? 128 : 32;
  706.   /* A single lshift is 4 ops: 2 LOADs into A/B, ADD, STORE back */
  707.   int ops_per_lshift = 4;
  708.   int max_shifts = max_math_ops / ops_per_lshift;
  709.  
  710.   /* We have up 16 GPRs we can use to save intermediate POT multiplications */
  711. #define MAX_STEPS 16
  712.  
  713.   int pot_shifts[MAX_STEPS + 1]; /* room to zero terminate */
  714.   int n_steps;
  715.  
  716.   if (factor == 0) {
  717.       BEGIN_BATCH(3 * 4);
  718.       OUT_BATCH(HSW_MI_MATH | (3 - 2));
  719.       OUT_BATCH(MI_MATH_ALU1(LOAD0, MI_MATH_OPERAND_SRCA));
  720.       OUT_BATCH(MI_MATH_ALU2(STORE, R0, MI_MATH_OPERAND_SRCA));
  721.       ADVANCE_BATCH();
  722.       return;
  723.   }
  724.  
  725.   /* Determine how many separate power of two multiplications to
  726.    * decompose this in to.
  727.    *
  728.    * We scan from most significant bits to the least. If the factor
  729.    * has > MAX_STEPS bits set then the least significant bits will be
  730.    * ignored, making the result less precise.
  731.    */
  732.   for (n_steps = 0; n_steps < MAX_STEPS && factor; n_steps++) {
  733.       pot_shifts[n_steps] = util_last_bit64(factor) - 1;
  734.       factor -= (1 << pot_shifts[n_steps]);
  735.   }
  736.  
  737.   /* zero terminate for calculating deltas below */
  738.   pot_shifts[n_steps] = 0;
  739.  
  740.   /* Example states:
  741.    * factor = 81, pot_shifts[0]=6, pot_shifts[1]=4, pot_shifts[2]=0, n_steps = 3
  742.    * factor = 80, pot_shifts[0]=6, pot_shifts[1]=4, n_steps = 2
  743.    * factor = 3,  pot_shifts[0]=1, pot_shifts[1]=0, n_steps = 2
  744.    * factor = 2,  pot_shifts[0]=1, n_steps = 1
  745.    * factor = 1,  pot_shifts[0]=0, n_steps = 1
  746.    */
  747.  
  748.   /* Starting with the least significant POT factor we shift the
  749.    * original value starting in R0 towards the most significant POT
  750.    * factor.
  751.    *
  752.    * For each step we progress which general purpose register we
  753.    * save the result into, with the first step always loading and
  754.    * saving into R0, then the next step loads from R0 saving to
  755.    * R1, for a total of up to 16 intermediate POT multiplications.
  756.    *
  757.    * Note: the first step might be a factor of one (shift 0), which is
  758.    * a NOOP
  759.    */
  760.   for (int i = pot_shifts[n_steps - 1] ? 0 : 1; i < n_steps; i++) {
  761.       int step_load_gpr = i ? i - 1: i; /* load + store to R0 for first step */
  762.       int step_store_gpr = i;
  763.  
  764.       /* How far to shift before reaching the next step? */
  765.       int shift = pot_shifts[n_steps - i - 1] - pot_shifts[n_steps - i];
  766.  
  767.       /* Note: careful to round up here... */
  768.       int n_cmds = (shift + max_shifts - 1) / max_shifts;
  769.       int batch_len = n_cmds * 4 + shift * ops_per_lshift;
  770.       bool first_step_shift = true;
  771.  
  772.       BEGIN_BATCH(batch_len);
  773.  
  774.       while (shift) {
  775.           int n_packed_shifts = min(max_shifts, shift);
  776.           int cmd_len = ops_per_lshift * n_packed_shifts + 1;
  777.  
  778.           OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
  779.  
  780.           for (int i = 0; i < n_packed_shifts; i++) {
  781.               int load_gpr = first_step_shift ? step_load_gpr : step_store_gpr;
  782.  
  783.               OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, load_gpr));
  784.               OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, load_gpr));
  785.               OUT_BATCH(MI_MATH_ALU0(ADD));
  786.               OUT_BATCH(MI_MATH_ALU2(STORE, step_store_gpr,
  787.                                      MI_MATH_OPERAND_ACCU));
  788.  
  789.               first_step_shift = false;
  790.           }
  791.  
  792.           shift -= n_packed_shifts;
  793.       }
  794.  
  795.       ADVANCE_BATCH();
  796.   }
  797.  
  798.   /* If the multiplication was split up then sum intermediate values... */
  799.   if (n_steps > 1) {
  800.       int ops_per_add = 4; /* 2 LOADS in to A/B, ADD, STORE back */
  801.       int max_adds = max_math_ops / ops_per_add;
  802.       int adds = n_steps - 1;
  803.  
  804.       /* Note: careful to round up here... */
  805.       int n_cmds = (adds + max_adds - 1) / max_adds;
  806.       int batch_len = n_cmds * 4 + adds * ops_per_add;
  807.  
  808.       int saved = R0 + 1;
  809.  
  810.       BEGIN_BATCH(batch_len);
  811.  
  812.       while (adds) {
  813.           int n_packed_adds = min(max_adds, adds);
  814.           int cmd_len = ops_per_add * n_packed_adds + 1;
  815.  
  816.           OUT_BATCH(HSW_MI_MATH | (cmd_len - 2));
  817.  
  818.           for (int i = 0; i < n_packed_adds; i++) {
  819.               OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCA, R0));
  820.               OUT_BATCH(MI_MATH_ALU2(LOAD, MI_MATH_OPERAND_SRCB, saved));
  821.               OUT_BATCH(MI_MATH_ALU0(ADD));
  822.               OUT_BATCH(MI_MATH_ALU2(STORE, R0, MI_MATH_OPERAND_ACCU));
  823.               saved++;
  824.           }
  825.  
  826.           adds -= n_packed_adds;
  827.       }
  828.  
  829.       ADVANCE_BATCH();
  830.   }
  831. }
  832.  
  833. static void
  834. alu_mul_gpr0_float(struct brw_context *brw, float factor)
  835. {
  836.   assert(factor >= 0);
  837.  
  838.   if (floorf(factor) != factor) {
  839.       /* If we need to multiply by a floating point factor then
  840.        * scaling by a further 2^16 will effectively result in a
  841.        * 48:16 fixed point value...
  842.        */
  843.       alu_mul_gpr0_u64(brw, factor * 65536);
  844.  
  845.       /* So now we just need to drop the fixed point fraction */
  846.       alu_rshift_gpr_32_with_2_tmp(brw, R0, 16, R0 + 1, R0 + 2);
  847.   } else
  848.       alu_mul_gpr0_u64(brw, factor);
  849. }
  850.  
  851. int
  852. main(int argc, char **argv)
  853. {
  854.   alu_gprs[0] = 123; /* load 'timestamp' */
  855.  
  856.   //alu_mul_gpr0_float(&_brw, 64);
  857.   //alu_mul_gpr0_float(&_brw, 80);
  858.   //alu_mul_gpr0_float(&_brw, 83);
  859.   alu_mul_gpr0_float(&_brw, 83.33);
  860.  
  861.   printf("Batch length = %d words\n", batch_ptr);
  862.   printf("Initial GPR0 = %"PRIu64"\n", alu_gprs[0]);
  863.  
  864.   for (int i = 0; i < batch_ptr; i++)
  865.     rcs_parse(batch[i]);
  866.  
  867.   printf("Final GPR0 = %"PRIu64"\n", alu_gprs[0]);
  868.  
  869.   return 0;
  870. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement