  1. commit 160a4208f8907e6f6f8d7f2877214c2524318b80
  2. Author: Suici Doga <suiciwd@gmail.com>
  3. Date:   Wed Feb 8 18:10:32 2017 +0530
  4.  
  5.     WIP
  6.  
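In brief (a summary inferred from the diff below, not the author's own description): the series adds a direct TGSI-to-sb translation path (sb_tgsi.cpp, new r600_sb_compile_tgsi() entry point) gated by a new DBG_SB_TGSI / "sbtgsi" debug flag, generalizes gpr_array into rel_array with new VLK_TGSI_* value kinds, folds abs/neg modifiers into float literals and maps the +/-0.5 and +/-1.0 cases to inline constants, marks MULLO/MULHI as AF_CM_EXPAND for Cayman slot expansion, and adds an array read/write latency check with NOP-group insertion in the post scheduler.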
  7. diff --git a/src/gallium/drivers/r600/Makefile.sources b/src/gallium/drivers/r600/Makefile.sources
  8. index df083d7..e0083e2 100644
  9. --- a/src/gallium/drivers/r600/Makefile.sources
  10. +++ b/src/gallium/drivers/r600/Makefile.sources
  11. @@ -45,6 +45,7 @@ CXX_SOURCES = \
  12.     sb/sb_sched.cpp \
  13.     sb/sb_shader.cpp \
  14.     sb/sb_ssa_builder.cpp \
  15. +   sb/sb_tgsi.cpp \
  16.     sb/sb_valtable.cpp
  17.  
  18.  LLVM_C_SOURCES = r600_llvm.c
  19. diff --git a/src/gallium/drivers/r600/r600_isa.h b/src/gallium/drivers/r600/r600_isa.h
  20. index c6bb869..504ef42 100644
  21. --- a/src/gallium/drivers/r600/r600_isa.h
  22. +++ b/src/gallium/drivers/r600/r600_isa.h
  23. @@ -42,6 +42,10 @@ enum alu_op_flags
  24.     AF_4V       = (AF_V | AF_4SLOT),
  25.     AF_VS       = (AF_V | AF_S),     /* allowed in any slot */
  26.  
  27. +   // MULLO_INT, MULHI_INT (and _UINT) should be expanded to 4 slots
  28. +   AF_CM_EXPAND = (1<<3),
  29. +   AF_4VE      = (AF_4V | AF_CM_EXPAND),
  30. +
  31.     AF_KILL     = (1<<4),
  32.     AF_PRED     = (1<<5),
  33.     AF_SET      = (1<<6),
  34. @@ -285,10 +289,10 @@ static const struct alu_op_info alu_op_table[] = {
  35.         {"SQRT_IEEE",                 1, { 0x6A, 0x8A },{   AF_S,  AF_S,  AF_S,  AF_S},  AF_IEEE },
  36.         {"SIN",                       1, { 0x6E, 0x8D },{   AF_S,  AF_S,  AF_S,  AF_S},  0 },
  37.         {"COS",                       1, { 0x6F, 0x8E },{   AF_S,  AF_S,  AF_S,  AF_S},  0 },
  38. -       {"MULLO_INT",                 2, { 0x73, 0x8F },{   AF_S,  AF_S,  AF_S,  AF_4V},  AF_M_COMM | AF_INT_DST | AF_REPL},
  39. -       {"MULHI_INT",                 2, { 0x74, 0x90 },{   AF_S,  AF_S,  AF_S,  AF_4V},  AF_M_COMM | AF_INT_DST | AF_REPL},
  40. -       {"MULLO_UINT",                2, { 0x75, 0x91 },{   AF_S,  AF_S,  AF_S,  AF_4V},  AF_M_COMM | AF_UINT_DST | AF_REPL},
  41. -       {"MULHI_UINT",                2, { 0x76, 0x92 },{   AF_S,  AF_S,  AF_S,  AF_4V},  AF_M_COMM | AF_UINT_DST | AF_REPL},
  42. +       {"MULLO_INT",                 2, { 0x73, 0x8F },{   AF_S,  AF_S,  AF_S,  AF_4VE},  AF_M_COMM | AF_INT_DST | AF_REPL},
  43. +       {"MULHI_INT",                 2, { 0x74, 0x90 },{   AF_S,  AF_S,  AF_S,  AF_4VE},  AF_M_COMM | AF_INT_DST | AF_REPL},
  44. +       {"MULLO_UINT",                2, { 0x75, 0x91 },{   AF_S,  AF_S,  AF_S,  AF_4VE},  AF_M_COMM | AF_UINT_DST | AF_REPL},
  45. +       {"MULHI_UINT",                2, { 0x76, 0x92 },{   AF_S,  AF_S,  AF_S,  AF_4VE},  AF_M_COMM | AF_UINT_DST | AF_REPL},
  46.         {"RECIP_INT",                 1, { 0x77, 0x93 },{   AF_S,  AF_S,  AF_S,     0},  AF_INT_DST },
  47.         {"RECIP_UINT",                1, { 0x78, 0x94 },{   AF_S,  AF_S,  AF_S,     0},  AF_UINT_DST },
  48.         {"RECIP_64",                  2, {   -1, 0x95 },{      0,     0,  AF_S,  AF_S},  AF_64 },
  49. diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
  50. index 49abf50..4006b7a 100644
  51. --- a/src/gallium/drivers/r600/r600_pipe.c
  52. +++ b/src/gallium/drivers/r600/r600_pipe.c
  53. @@ -75,6 +75,7 @@ static const struct debug_named_value debug_options[] = {
  54.     { "sbnofallback", DBG_SB_NO_FALLBACK, "Abort on errors instead of fallback" },
  55.     { "sbdisasm", DBG_SB_DISASM, "Use sb disassembler for shader dumps" },
  56.     { "sbsafemath", DBG_SB_SAFEMATH, "Disable unsafe math optimizations" },
  57. +   { "sbtgsi", DBG_SB_TGSI, "Use sb and its own tgsi translator"},
  58.  
  59.     DEBUG_NAMED_VALUE_END /* must be last */
  60.  };
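Context note (not part of the diff): debug flags registered in r600's debug_options[] table are normally picked up at runtime from the R600_DEBUG environment variable, so, assuming the usual Mesa behaviour, the path added by this patch would be enabled by setting R600_DEBUG=sbtgsi in the environment, optionally combined with the existing sb flags such as sbdisasm.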
  61. diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
  62. index 349a6cb..86537a5 100644
  63. --- a/src/gallium/drivers/r600/r600_pipe.h
  64. +++ b/src/gallium/drivers/r600/r600_pipe.h
  65. @@ -257,6 +257,7 @@ typedef boolean (*r600g_dma_blit_t)(struct pipe_context *ctx,
  66.  #define DBG_SB_NO_FALLBACK (1 << 26)
  67.  #define DBG_SB_DISASM  (1 << 27)
  68.  #define DBG_SB_SAFEMATH    (1 << 28)
  69. +#define DBG_SB_TGSI        (1 << 29)
  70.  
  71.  struct r600_tiling_info {
  72.     unsigned num_channels;
  73. diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
  74. index dc44fae..2023bab 100644
  75. --- a/src/gallium/drivers/r600/r600_shader.c
  76. +++ b/src/gallium/drivers/r600/r600_shader.c
  77. @@ -141,6 +141,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
  78.     uint32_t *ptr;
  79.     bool dump = r600_can_dump_shader(rctx->screen, tgsi_get_processor_type(sel->tokens));
  80.     unsigned use_sb = rctx->screen->debug_flags & DBG_SB;
  81. +   unsigned use_sbtgsi = rctx->screen->debug_flags & DBG_SB_TGSI;
  82.     unsigned sb_disasm = use_sb || (rctx->screen->debug_flags & DBG_SB_DISASM);
  83.  
  84.     shader->shader.bc.isa = rctx->isa;
  85. @@ -153,35 +154,46 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
  86.             r600_dump_streamout(&sel->so);
  87.         }
  88.     }
  89. -   r = r600_shader_from_tgsi(rctx->screen, shader, key);
  90. -   if (r) {
  91. -       R600_ERR("translation from TGSI failed !\n");
  92. -       return r;
  93. -   }
  94.  
  95. -   /* Check if the bytecode has already been built.  When using the llvm
  96. -    * backend, r600_shader_from_tgsi() will take care of building the
  97. -    * bytecode.
  98. -    */
  99. -   if (!shader->shader.bc.bytecode) {
  100. -       r = r600_bytecode_build(&shader->shader.bc);
  101. +   if (use_sbtgsi) {
  102. +       r = r600_sb_compile_tgsi(rctx, shader, key, dump);
  103.         if (r) {
  104. -           R600_ERR("building bytecode failed !\n");
  105. +           R600_ERR("SB: TGSI compilation failed!\n");
  106.             return r;
  107.         }
  108. -   }
  109.  
  110. -   if (dump && !sb_disasm) {
  111. -       fprintf(stderr, "--------------------------------------------------------------\n");
  112. -       r600_bytecode_disasm(&shader->shader.bc);
  113. -       fprintf(stderr, "______________________________________________________________\n");
  114. -   } else if ((dump && sb_disasm) || use_sb) {
  115. -       r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
  116. -                                    dump, use_sb);
  117. +   } else {
  118. +
  119. +       r = r600_shader_from_tgsi(rctx->screen, shader, key);
  120.         if (r) {
  121. -           R600_ERR("r600_sb_bytecode_process failed !\n");
  122. +           R600_ERR("translation from TGSI failed !\n");
  123.             return r;
  124.         }
  125. +
  126. +       /* Check if the bytecode has already been built.  When using the llvm
  127. +        * backend, r600_shader_from_tgsi() will take care of building the
  128. +        * bytecode.
  129. +        */
  130. +       if (!shader->shader.bc.bytecode) {
  131. +           r = r600_bytecode_build(&shader->shader.bc);
  132. +           if (r) {
  133. +               R600_ERR("building bytecode failed !\n");
  134. +               return r;
  135. +           }
  136. +       }
  137. +
  138. +       if (dump && !sb_disasm) {
  139. +           fprintf(stderr, "--------------------------------------------------------------\n");
  140. +           r600_bytecode_disasm(&shader->shader.bc);
  141. +           fprintf(stderr, "______________________________________________________________\n");
  142. +       } else if ((dump && sb_disasm) || use_sb) {
  143. +           r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
  144. +                                        dump, use_sb);
  145. +           if (r) {
  146. +               R600_ERR("r600_sb_bytecode_process failed !\n");
  147. +               return r;
  148. +           }
  149. +       }
  150.     }
  151.  
  152.     /* Store the shader in a buffer. */
  153. diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h
  154. index ad1b862..89c8c4e 100644
  155. --- a/src/gallium/drivers/r600/sb/sb_bc.h
  156. +++ b/src/gallium/drivers/r600/sb/sb_bc.h
  157. @@ -902,14 +902,13 @@ public:
  158.  class bc_builder {
  159.     shader &sh;
  160.     sb_context &ctx;
  161. -   bytecode bb;
  162. +   bytecode &bb;
  163.     int error;
  164.  
  165.  public:
  166.  
  167.     bc_builder(shader &s);
  168.     int build();
  169. -   bytecode& get_bytecode() { assert(!error); return bb; }
  170.  
  171.  private:
  172.  
  173. diff --git a/src/gallium/drivers/r600/sb/sb_bc_builder.cpp b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp
  174. index 55e2a85..9b065b6 100644
  175. --- a/src/gallium/drivers/r600/sb/sb_bc_builder.cpp
  176. +++ b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp
  177. @@ -31,7 +31,7 @@
  178.  namespace r600_sb {
  179.  
  180.  bc_builder::bc_builder(shader &s)
  181. -   : sh(s), ctx(s.get_ctx()), bb(ctx.hw_class_bit()), error(0) {}
  182. +   : sh(s), ctx(s.get_ctx()), bb(s.get_bytecode()), error(0) {}
  183.  
  184.  int bc_builder::build() {
  185.  
  186. diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
  187. index c56c866..4218990 100644
  188. --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
  189. +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
  190. @@ -32,6 +32,8 @@
  191.  #define FBC_DUMP(q)
  192.  #endif
  193.  
  194. +#include "cmath"
  195. +
  196.  #include "sb_bc.h"
  197.  #include "sb_shader.h"
  198.  #include "sb_pass.h"
  199. @@ -306,22 +308,26 @@ void bc_finalizer::finalize_alu_src(alu_group_node* g, alu_node* a) {
  200.         sel_chan gpr;
  201.  
  202.         switch (v->kind) {
  203. -       case VLK_REL_REG:
  204. -           sc = v->get_final_gpr();
  205. -           src.sel = sc.sel();
  206. -           src.chan = sc.chan();
  207. -           if (!v->rel->is_const()) {
  208. -               src.rel = 1;
  209. -               update_ngpr(v->array->gpr.sel() + v->array->array_size -1);
  210. -           } else
  211. -               src.rel = 0;
  212. -
  213. -           break;
  214.         case VLK_REG:
  215. -           gpr = v->get_final_gpr();
  216. -           src.sel = gpr.sel();
  217. -           src.chan = gpr.chan();
  218. -           update_ngpr(src.sel);
  219. +       case VLK_TGSI_INPUT:
  220. +       case VLK_TGSI_OUTPUT:
  221. +       case VLK_TGSI_TEMP:
  222. +       case VLK_TGSI_ADDR:
  223. +           if (v->rel) {
  224. +               sc = v->get_final_gpr();
  225. +               src.sel = sc.sel();
  226. +               src.chan = sc.chan();
  227. +               if (!v->rel->is_const()) {
  228. +                   src.rel = 1;
  229. +                   update_ngpr(v->array->gpr.sel() + v->array->array_size -1);
  230. +               } else
  231. +                   src.rel = 0;
  232. +           } else {
  233. +               gpr = v->get_final_gpr();
  234. +               src.sel = gpr.sel();
  235. +               src.chan = gpr.chan();
  236. +               update_ngpr(src.sel);
  237. +           }
  238.             break;
  239.         case VLK_TEMP:
  240.             src.sel = v->gpr.sel();
  241. @@ -333,13 +339,32 @@ void bc_finalizer::finalize_alu_src(alu_group_node* g, alu_node* a) {
  242.             literal lv = v->literal_value;
  243.             src.chan = 0;
  244.  
  245. +           if (src.abs) {
  246. +               lv.f = fabs(lv.f);
  247. +               src.abs = 0;
  248. +           }
  249. +           if (src.neg) {
  250. +               lv.f = -lv.f;
  251. +               src.neg = 0;
  252. +           }
  253. +
  254.             if (lv == literal(0))
  255.                 src.sel = ALU_SRC_0;
  256. -           else if (lv == literal(0.5f))
  257. +           else if (lv == literal(0x80000000)) {
  258. +               // XXX probably we shouldn't have -0 here in the first place?
  259. +               src.sel = ALU_SRC_0;
  260. +               src.neg = 1;
  261. +           } else if (lv == literal(0.5f))
  262.                 src.sel = ALU_SRC_0_5;
  263. -           else if (lv == literal(1.0f))
  264. +           else if (lv == literal(-0.5f)) {
  265. +               src.sel = ALU_SRC_0_5;
  266. +               src.neg = 1;
  267. +           } else if (lv == literal(1.0f))
  268. +               src.sel = ALU_SRC_1;
  269. +           else if (lv == literal(-1.0f)) {
  270.                 src.sel = ALU_SRC_1;
  271. -           else if (lv == literal(1))
  272. +               src.neg = 1;
  273. +           } else if (lv == literal(1))
  274.                 src.sel = ALU_SRC_1_INT;
  275.             else if (lv == literal(-1))
  276.                 src.sel = ALU_SRC_M_1_INT;
  277. @@ -477,9 +502,11 @@ void bc_finalizer::finalize_fetch(fetch_node* f) {
  278.  
  279.         value *v = f->src[chan];
  280.  
  281. -       if (v->is_undef()) {
  282. +       if (!v)
  283.             sel = SEL_MASK;
  284. -       } else if (v->is_const()) {
  285. +       else if (v->is_undef())
  286. +           sel = SEL_0;
  287. +       else if (v->is_const()) {
  288.             literal l = v->literal_value;
  289.             if (l == literal(0))
  290.                 sel = SEL_0;
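A minimal standalone sketch of the literal handling added above (the helper and struct names here are illustrative only; the ALU_SRC_* names follow the ones visible in the hunk): abs/neg source modifiers are folded into the float value first, then the well-known values are mapped onto hardware inline constants, with the neg bit carrying the negative variants, including -0.0 (bit pattern 0x80000000).

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    struct mapped_src { const char *sel; bool neg; };

    // Fold abs/neg into the literal, then try to use an inline constant.
    static mapped_src map_float_literal(float f, bool abs_mod, bool neg_mod) {
        if (abs_mod) f = std::fabs(f);
        if (neg_mod) f = -f;

        uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));   // compare bit patterns, as literal() does

        if (bits == 0u)          return { "ALU_SRC_0",   false };
        if (bits == 0x80000000u) return { "ALU_SRC_0",   true  };  // -0.0
        if (f ==  0.5f)          return { "ALU_SRC_0_5", false };
        if (f == -0.5f)          return { "ALU_SRC_0_5", true  };
        if (f ==  1.0f)          return { "ALU_SRC_1",   false };
        if (f == -1.0f)          return { "ALU_SRC_1",   true  };
        return { "ALU_SRC_LITERAL", false };     // needs a literal slot
    }

    int main() {
        mapped_src s = map_float_literal(1.0f, false, true);   // folds to -1.0f
        assert(std::strcmp(s.sel, "ALU_SRC_1") == 0 && s.neg);
        return 0;
    }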
  291. diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
  292. index 67e6c3a..320a081 100644
  293. --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
  294. +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
  295. @@ -112,7 +112,7 @@ int bc_parser::parse_decls() {
  296.  
  297.     if (!pshader) {
  298.         if (gpr_reladdr)
  299. -           sh->add_gpr_array(0, bc->ngpr, 0x0F);
  300. +           sh->add_rel_array(VLK_REG, 0, bc->ngpr, 0x0F);
  301.  
  302.         // compute shaders have some values preloaded in R0, R1
  303.         sh->add_input(0 /* GPR */, true /* preloaded */, 0x0F /* mask */);
  304. @@ -127,10 +127,11 @@ int bc_parser::parse_decls() {
  305.         if (pshader->num_arrays) {
  306.             for (unsigned i = 0; i < pshader->num_arrays; ++i) {
  307.                 r600_shader_array &a = pshader->arrays[i];
  308. -               sh->add_gpr_array(a.gpr_start, a.gpr_count, a.comp_mask);
  309. +               sh->add_rel_array(VLK_REG, a.gpr_start, a.gpr_count,
  310. +                   a.comp_mask);
  311.             }
  312.         } else {
  313. -           sh->add_gpr_array(0, pshader->bc.ngpr, 0x0F);
  314. +           sh->add_rel_array(VLK_REG, 0, pshader->bc.ngpr, 0x0F);
  315.         }
  316.     }
  317.  
  318. @@ -329,7 +330,7 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
  319.         unsigned src_count = n->bc.op_ptr->src_count;
  320.  
  321.         if (ctx.alu_slots(n->bc.op) & AF_4SLOT)
  322. -           n->flags |= NF_ALU_4SLOT;
  323. +           n->flags |= NF_ALU_4SLOT; // XXX it seems flag is used in dump only
  324.  
  325.         n->src.resize(src_count);
  326.  
  327. @@ -381,6 +382,14 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) {
  328.             bc_alu_src &src = n->bc.src[s];
  329.  
  330.             if (src.sel == ALU_SRC_LITERAL) {
  331. +               if (src.abs) {
  332. +                   src.value.f = fabs(src.value.f);
  333. +                   src.abs = 0;
  334. +               }
  335. +               if (src.neg) {
  336. +                   src.value.f = -src.value.f;
  337. +                   src.neg = 0;
  338. +               }
  339.                 n->src[s] = sh->get_const_value(src.value);
  340.             } else if (src.sel == ALU_SRC_PS || src.sel == ALU_SRC_PV) {
  341.                 unsigned pgroup = !cgroup, prev_slot = src.sel == ALU_SRC_PS ?
  342. diff --git a/src/gallium/drivers/r600/sb/sb_core.cpp b/src/gallium/drivers/r600/sb/sb_core.cpp
  343. index d907508..08dc032 100644
  344. --- a/src/gallium/drivers/r600/sb/sb_core.cpp
  345. +++ b/src/gallium/drivers/r600/sb/sb_core.cpp
  346. @@ -28,7 +28,6 @@
  347.  
  348.  extern "C" {
  349.  #include "os/os_time.h"
  350. -#include "r600_pipe.h"
  351.  #include "r600_shader.h"
  352.  
  353.  #include "sb_public.h"
  354. @@ -41,6 +40,7 @@ extern "C" {
  355.  #include "sb_shader.h"
  356.  #include "sb_pass.h"
  357.  #include "sb_sched.h"
  358. +#include "sb_tgsi.h"
  359.  
  360.  using namespace r600_sb;
  361.  
  362. @@ -89,73 +89,9 @@ void r600_sb_context_destroy(void * sctx) {
  363.     }
  364.  }
  365.  
  366. -int r600_sb_bytecode_process(struct r600_context *rctx,
  367. -                             struct r600_bytecode *bc,
  368. -                             struct r600_shader *pshader,
  369. -                             int dump_bytecode,
  370. -                             int optimize) {
  371. -   int r = 0;
  372. -   unsigned shader_id = bc->debug_id;
  373. +static int sb_build_bytecode(shader *sh, int dump_bytecode) {
  374.  
  375. -   sb_context *ctx = (sb_context *)rctx->sb_context;
  376. -   if (!ctx) {
  377. -       rctx->sb_context = ctx = r600_sb_context_create(rctx);
  378. -   }
  379. -
  380. -   int64_t time_start = 0;
  381. -   if (sb_context::dump_stat) {
  382. -       time_start = os_time_get_nano();
  383. -   }
  384. -
  385. -   SB_DUMP_STAT( sblog << "\nsb: shader " << shader_id << "\n"; );
  386. -
  387. -   bc_parser parser(*ctx, bc, pshader);
  388. -
  389. -   if ((r = parser.decode())) {
  390. -       assert(!"sb: bytecode decoding error");
  391. -       return r;
  392. -   }
  393. -
  394. -   shader *sh = parser.get_shader();
  395. -
  396. -   if (dump_bytecode) {
  397. -       bc_dump(*sh, bc->bytecode, bc->ndw).run();
  398. -   }
  399. -
  400. -   if (!optimize) {
  401. -       delete sh;
  402. -       return 0;
  403. -   }
  404. -
  405. -   if (sh->target != TARGET_FETCH) {
  406. -       sh->src_stats.ndw = bc->ndw;
  407. -       sh->collect_stats(false);
  408. -   }
  409. -
  410. -   /* skip some shaders (use shaders from default backend)
  411. -    * dskip_start - range start, dskip_end - range_end,
  412. -    * e.g. start = 5, end = 6 means shaders 5 & 6
  413. -    *
  414. -    * dskip_mode == 0 - disabled,
  415. -    * dskip_mode == 1 - don't process the shaders from the [start;end] range
  416. -    * dskip_mode == 2 - process only the shaders from the range
  417. -    */
  418. -   if (sb_context::dskip_mode) {
  419. -       if ((sb_context::dskip_start <= shader_id &&
  420. -               shader_id <= sb_context::dskip_end) ==
  421. -                       (sb_context::dskip_mode == 1)) {
  422. -           sblog << "sb: skipped shader " << shader_id << " : " << "["
  423. -                   << sb_context::dskip_start << "; "
  424. -                   << sb_context::dskip_end << "] mode "
  425. -                   << sb_context::dskip_mode << "\n";
  426. -           return 0;
  427. -       }
  428. -   }
  429. -
  430. -   if ((r = parser.prepare())) {
  431. -       assert(!"sb: bytecode parsing error");
  432. -       return r;
  433. -   }
  434. +   int r;
  435.  
  436.     SB_DUMP_PASS( sblog << "\n\n###### after parse\n"; sh->dump_ir(); );
  437.  
  438. @@ -240,29 +176,147 @@ int r600_sb_bytecode_process(struct r600_context *rctx,
  439.         return r;
  440.     }
  441.  
  442. -   bytecode &nbc = builder.get_bytecode();
  443. +   bytecode &nbc = sh->get_bytecode();
  444.  
  445.     if (dump_bytecode) {
  446.         bc_dump(*sh, &nbc).run();
  447.     }
  448.  
  449. -   if (!sb_context::dry_run) {
  450. +   return 0;
  451. +}
  452.  
  453. -       free(bc->bytecode);
  454. -       bc->ndw = nbc.ndw();
  455. -       bc->bytecode = (uint32_t*) malloc(bc->ndw << 2);
  456. -       nbc.write_data(bc->bytecode);
  457.  
  458. -       bc->ngpr = sh->ngpr;
  459. -       bc->nstack = sh->nstack;
  460. -   } else {
  461. -       SB_DUMP_STAT( sblog << "sb: dry run: optimized bytecode is not used\n"; );
  462. +int r600_sb_compile_tgsi(struct r600_context *rctx,
  463. +                           struct r600_pipe_shader *pipe_shader,
  464. +                           struct r600_shader_key key,
  465. +                           int dump_bytecode) {
  466. +   int r = 0;
  467. +
  468. +   r600_bytecode *bc = &pipe_shader->shader.bc;
  469. +   r600_bytecode_init(bc, rctx->chip_class, rctx->family,
  470. +                      rctx->screen->has_compressed_msaa_texturing);
  471. +
  472. +   sb_context *ctx = (sb_context *)rctx->sb_context;
  473. +   if (!ctx) {
  474. +       rctx->sb_context = ctx = r600_sb_context_create(rctx);
  475. +   }
  476. +
  477. +   int64_t time_start = 0;
  478. +   if (sb_context::dump_stat) {
  479. +       time_start = os_time_get_nano();
  480.     }
  481.  
  482. +   unsigned shader_id = bc->debug_id;
  483. +   SB_DUMP_STAT( sblog << "\nsb: shader " << shader_id << "\n"; );
  484. +
  485. +   // translate from tgsi
  486. +
  487. +   tgsi_translator tt(*ctx, pipe_shader, key, shader_id);
  488. +
  489. +   shader* sh = tt.translate();
  490. +   assert(sh);
  491. +
  492. +   r = sb_build_bytecode(sh, dump_bytecode);
  493. +   if (r)
  494. +       return r;
  495. +
  496.     if (sb_context::dump_stat) {
  497.         int64_t t = os_time_get_nano() - time_start;
  498.  
  499. -       sblog << "sb: processing shader " << shader_id << " done ( "
  500. +       sblog << "sb: tgsi compilation of shader " << sh->id << " done ( "
  501. +               << ((double)t)/1000000.0 << " ms ).\n";
  502. +   }
  503. +
  504. +   bytecode &nbc = sh->get_bytecode();
  505. +
  506. +   bc->ndw = nbc.ndw();
  507. +   bc->bytecode = (uint32_t*) malloc(bc->ndw << 2);
  508. +   nbc.write_data(bc->bytecode);
  509. +
  510. +   bc->ngpr = sh->ngpr;
  511. +   bc->nstack = sh->nstack;
  512. +
  513. +   delete sh;
  514. +   return 0;
  515. +}
  516. +
  517. +int r600_sb_bytecode_process(struct r600_context *rctx,
  518. +                             struct r600_bytecode *bc,
  519. +                             struct r600_shader *pshader,
  520. +                             int dump_bytecode,
  521. +                             int optimize) {
  522. +   int r = 0;
  523. +   unsigned shader_id = bc->debug_id;
  524. +
  525. +   sb_context *ctx = (sb_context *)rctx->sb_context;
  526. +   if (!ctx) {
  527. +       rctx->sb_context = ctx = r600_sb_context_create(rctx);
  528. +   }
  529. +
  530. +   int64_t time_start = 0;
  531. +   if (sb_context::dump_stat) {
  532. +       time_start = os_time_get_nano();
  533. +   }
  534. +
  535. +   SB_DUMP_STAT( sblog << "\nsb: shader " << shader_id << "\n"; );
  536. +
  537. +   bc_parser parser(*ctx, bc, pshader);
  538. +
  539. +   if ((r = parser.decode())) {
  540. +       assert(!"sb: bytecode decoding error");
  541. +       return r;
  542. +   }
  543. +
  544. +   shader *sh = parser.get_shader();
  545. +
  546. +   if (dump_bytecode) {
  547. +       bc_dump(*sh, bc->bytecode, bc->ndw).run();
  548. +   }
  549. +
  550. +   if (!optimize) {
  551. +       delete sh;
  552. +       return 0;
  553. +   }
  554. +
  555. +   if (sh->target != TARGET_FETCH) {
  556. +       sh->src_stats.ndw = bc->ndw;
  557. +       sh->collect_stats(false);
  558. +   }
  559. +
  560. +   /* skip some shaders (use shaders from default backend)
  561. +    * dskip_start - range start, dskip_end - range_end,
  562. +    * e.g. start = 5, end = 6 means shaders 5 & 6
  563. +    *
  564. +    * dskip_mode == 0 - disabled,
  565. +    * dskip_mode == 1 - don't process the shaders from the [start;end] range
  566. +    * dskip_mode == 2 - process only the shaders from the range
  567. +    */
  568. +   if (sb_context::dskip_mode) {
  569. +       if ((sb_context::dskip_start <= shader_id &&
  570. +               shader_id <= sb_context::dskip_end) ==
  571. +                       (sb_context::dskip_mode == 1)) {
  572. +           sblog << "sb: skipped shader " << shader_id << " : " << "["
  573. +                   << sb_context::dskip_start << "; "
  574. +                   << sb_context::dskip_end << "] mode "
  575. +                   << sb_context::dskip_mode << "\n";
  576. +           return 0;
  577. +       }
  578. +   }
  579. +
  580. +   if ((r = parser.prepare())) {
  581. +       assert(!"sb: bytecode parsing error");
  582. +       return r;
  583. +   }
  584. +
  585. +
  586. +   r = sb_build_bytecode(sh, dump_bytecode);
  587. +   if (r)
  588. +       return r;
  589. +
  590. +   if (sb_context::dump_stat) {
  591. +       int64_t t = os_time_get_nano() - time_start;
  592. +
  593. +       sblog << "sb: processing shader " << sh->id << " done ( "
  594.                 << ((double)t)/1000000.0 << " ms ).\n";
  595.  
  596.         sh->opt_stats.ndw = bc->ndw;
  597. @@ -276,6 +330,20 @@ int r600_sb_bytecode_process(struct r600_context *rctx,
  598.         sh->src_stats.dump_diff(sh->opt_stats);
  599.     }
  600.  
  601. +   if (!sb_context::dry_run) {
  602. +       bytecode &nbc = sh->get_bytecode();
  603. +
  604. +       free(bc->bytecode);
  605. +       bc->ndw = nbc.ndw();
  606. +       bc->bytecode = (uint32_t*) malloc(bc->ndw << 2);
  607. +       nbc.write_data(bc->bytecode);
  608. +
  609. +       bc->ngpr = sh->ngpr;
  610. +       bc->nstack = sh->nstack;
  611. +   } else {
  612. +       SB_DUMP_STAT( sblog << "sb: dry run: optimized bytecode is not used\n"; );
  613. +   }
  614. +
  615.     delete sh;
  616.     return 0;
  617.  }
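The dskip predicate moved above is compact; this is a self-contained sketch of its semantics (the helper name and the values are hypothetical, the behaviour matches the comment in the code):

    #include <cassert>

    // mode 0: disabled; mode 1: skip shaders inside [start; end];
    // mode 2: skip shaders outside the range (process only the range).
    static bool dskip_shader(unsigned id, unsigned start, unsigned end, int mode) {
        if (mode == 0)
            return false;
        bool in_range = (start <= id && id <= end);
        return in_range == (mode == 1);
    }

    int main() {
        assert( dskip_shader(5, 5, 6, 1));   // start = 5, end = 6: shaders 5 & 6 skipped
        assert(!dskip_shader(7, 5, 6, 1));
        assert(!dskip_shader(5, 5, 6, 2));   // mode 2: only the range is processed
        assert( dskip_shader(7, 5, 6, 2));
        return 0;
    }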
  618. diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h
  619. index c838f62..716af59 100644
  620. --- a/src/gallium/drivers/r600/sb/sb_ir.h
  621. +++ b/src/gallium/drivers/r600/sb/sb_ir.h
  622. @@ -48,19 +48,33 @@ class node;
  623.  class value;
  624.  class shader;
  625.  
  626. +enum sel_class {
  627. +   SC_GPR = 0,
  628. +   SC_TGSI_INPUT = 1,
  629. +   SC_TGSI_OUTPUT = 2,
  630. +   SC_TGSI_ADDR = 3,
  631. +   SC_TGSI_CONST = 4,
  632. +
  633. +};
  634. +
  635. +
  636.  struct sel_chan
  637.  {
  638. +   static const unsigned reg_shift = 2;
  639. +   static const unsigned chan_mask = (1u << reg_shift) - 1;
  640. +
  641.     unsigned id;
  642.  
  643.     sel_chan(unsigned id = 0) : id(id) {}
  644. -   sel_chan(unsigned sel, unsigned chan) : id(((sel << 2) | chan) + 1) {}
  645. +   sel_chan(unsigned sel, unsigned chan)
  646. +       : id(((sel << 2) | chan) + 1) {}
  647.  
  648.     unsigned sel() const { return sel(id); }
  649.     unsigned chan() const {return chan(id); }
  650.     operator unsigned() const {return id;}
  651.  
  652.     static unsigned sel(unsigned idx) { return (idx-1) >> 2; }
  653. -   static unsigned chan(unsigned idx) { return (idx-1) & 3; }
  654. +   static unsigned chan(unsigned idx) { return (idx-1) & chan_mask; }
  655.  };
  656.  
  657.  inline sb_ostream& operator <<(sb_ostream& o, sel_chan r) {
  658. @@ -258,8 +272,9 @@ public:
  659.  class value;
  660.  
  661.  enum value_kind {
  662. +   VLK_INVALID,
  663. +
  664.     VLK_REG,
  665. -   VLK_REL_REG,
  666.     VLK_SPECIAL_REG,
  667.     VLK_TEMP,
  668.  
  669. @@ -268,6 +283,12 @@ enum value_kind {
  670.     VLK_PARAM,
  671.     VLK_SPECIAL_CONST,
  672.  
  673. +   VLK_TGSI_INPUT,
  674. +   VLK_TGSI_OUTPUT,
  675. +   VLK_TGSI_TEMP,
  676. +   VLK_TGSI_ADDR,
  677. +
  678. +
  679.     VLK_UNDEF
  680.  };
  681.  
  682. @@ -371,24 +392,29 @@ public:
  683.  
  684.  typedef sb_value_set val_set;
  685.  
  686. -struct gpr_array {
  687. -   sel_chan base_gpr; // original gpr
  688. +struct rel_array {
  689. +   value_kind kind;
  690. +
  691. +   sel_chan base_sel;
  692. +   sel_chan pin_gpr;
  693.     sel_chan gpr; // assigned by regalloc
  694.     unsigned array_size;
  695. +   unsigned array_id;
  696.  
  697. -   gpr_array(sel_chan base_gpr, unsigned array_size) : base_gpr(base_gpr),
  698. -           array_size(array_size) {}
  699. +   rel_array(value_kind kind, sel_chan base_sel, unsigned array_size,
  700. +             unsigned array_id)
  701. +       : kind(kind), base_sel(base_sel), pin_gpr(), gpr(),
  702. +         array_size(array_size), array_id(array_id), interferences(), refs() {}
  703.  
  704. -   unsigned hash() { return (base_gpr << 10) * array_size; }
  705. +   unsigned hash() { return ((kind << 16) | (base_sel)) * array_size; }
  706.  
  707.     val_set interferences;
  708.     vvec refs;
  709.  
  710.     bool is_dead();
  711. -
  712.  };
  713.  
  714. -typedef std::vector<gpr_array*> regarray_vec;
  715. +typedef std::vector<rel_array*> regarray_vec;
  716.  
  717.  enum value_flags {
  718.     VLF_UNDEF = (1 << 0),
  719. @@ -467,7 +493,7 @@ protected:
  720.     value(unsigned sh_id, value_kind k, sel_chan select, unsigned ver = 0)
  721.         : kind(k), flags(),
  722.             rel(), array(),
  723. -           version(ver), select(select), pin_gpr(select), gpr(),
  724. +           version(ver), select(select), pin_gpr(), gpr(),
  725.             gvn_source(), ghash(),
  726.             def(), adef(), uses(), constraint(), chunk(),
  727.             literal_value(), uid(sh_id) {}
  728. @@ -482,7 +508,7 @@ public:
  729.     vvec mdef;
  730.     vvec muse;
  731.     value *rel;
  732. -   gpr_array *array;
  733. +   rel_array *array;
  734.  
  735.     unsigned version;
  736.  
  737. @@ -528,8 +554,14 @@ public:
  738.  
  739.     bool is_undef() { return gvalue()->kind == VLK_UNDEF; }
  740.  
  741. +   bool is_tgsi_value() {
  742. +       return kind == VLK_TGSI_INPUT || kind == VLK_TGSI_OUTPUT ||
  743. +               kind == VLK_TGSI_TEMP || kind == VLK_TGSI_ADDR;
  744. +   }
  745. +
  746.     bool is_any_gpr() {
  747. -       return (kind == VLK_REG || kind == VLK_TEMP);
  748. +       return (!rel &&
  749. +               (kind == VLK_REG || kind == VLK_TEMP || is_tgsi_value()));
  750.     }
  751.  
  752.     bool is_agpr() {
  753. @@ -544,7 +576,7 @@ public:
  754.     bool is_special_reg() { return kind == VLK_SPECIAL_REG; }
  755.     bool is_any_reg() { return is_any_gpr() || is_special_reg(); }
  756.     bool is_kcache() { return kind == VLK_KCACHE; }
  757. -   bool is_rel() { return kind == VLK_REL_REG; }
  758. +   bool is_rel() { return rel != NULL; }
  759.     bool is_readonly() { return flags & VLF_READONLY; }
  760.  
  761.     bool is_chan_pinned() { return flags & VLF_PIN_CHAN; }
  762. @@ -573,8 +605,10 @@ public:
  763.                 && literal_value != literal(0)
  764.                 && literal_value != literal(1)
  765.                 && literal_value != literal(-1)
  766. -               && literal_value != literal(0.5)
  767. -               && literal_value != literal(1.0);
  768. +               && literal_value != literal(0.5f)
  769. +               && literal_value != literal(-0.5f)
  770. +               && literal_value != literal(1.0f)
  771. +               && literal_value != literal(-1.0f);
  772.     }
  773.  
  774.     void add_use(node *n, use_kind kind, int arg);
  775. @@ -594,7 +628,7 @@ public:
  776.  
  777.     sel_chan get_final_gpr() {
  778.         if (array && array->gpr) {
  779. -           int reg_offset = select.sel() - array->base_gpr.sel();
  780. +           int reg_offset = select.sel() - array->base_sel.sel();
  781.             if (rel && rel->is_const())
  782.                 reg_offset += rel->get_const_value().i;
  783.             return array->gpr + (reg_offset << 2);
  784. @@ -755,7 +789,7 @@ protected:
  785.     node(node_type nt, node_subtype nst, node_flags flags = NF_EMPTY)
  786.     : prev(), next(), parent(),
  787.       type(nt), subtype(nst), flags(flags),
  788. -     pred(), dst(), src() {}
  789. +     pred(), dst(), src(), source_line() {}
  790.  
  791.     virtual ~node() {};
  792.  
  793. @@ -772,6 +806,8 @@ public:
  794.     vvec dst;
  795.     vvec src;
  796.  
  797. +   unsigned source_line;
  798. +
  799.     virtual bool is_valid() { return true; }
  800.     virtual bool accept(vpass &p, bool enter);
  801.  
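For reference, a small standalone sketch of the sel_chan encoding touched above (a demo struct only, assuming exactly the behaviour shown in the hunk): a register select and a channel pack into a single non-zero id, with id 0 left free to mean "unassigned".

    #include <cassert>

    struct sel_chan_demo {
        static const unsigned reg_shift = 2;
        static const unsigned chan_mask = (1u << reg_shift) - 1;   // == 3

        unsigned id;

        sel_chan_demo(unsigned sel, unsigned chan)
            : id(((sel << reg_shift) | chan) + 1) {}

        unsigned sel()  const { return (id - 1) >> reg_shift; }
        unsigned chan() const { return (id - 1) & chan_mask; }
    };

    int main() {
        sel_chan_demo r(5, 2);                 // GPR 5, channel z
        assert(r.id == 23);                    // ((5 << 2) | 2) + 1
        assert(r.sel() == 5 && r.chan() == 2);
        assert(sel_chan_demo(0, 0).id == 1);   // id 0 stays reserved
        return 0;
    }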
  802. diff --git a/src/gallium/drivers/r600/sb/sb_liveness.cpp b/src/gallium/drivers/r600/sb/sb_liveness.cpp
  803. index 8ecc9a5..d683ea9 100644
  804. --- a/src/gallium/drivers/r600/sb/sb_liveness.cpp
  805. +++ b/src/gallium/drivers/r600/sb/sb_liveness.cpp
  806. @@ -345,10 +345,10 @@ void liveness::process_op(node& n) {
  807.  int liveness::init() {
  808.  
  809.     if (sh.compute_interferences) {
  810. -       gpr_array_vec &vv = sh.arrays();
  811. -       for (gpr_array_vec::iterator I = vv.begin(), E = vv.end(); I != E;
  812. +       rel_array_vec &vv = sh.arrays();
  813. +       for (rel_array_vec::iterator I = vv.begin(), E = vv.end(); I != E;
  814.                 ++I) {
  815. -           gpr_array *a = *I;
  816. +           rel_array *a = *I;
  817.             a->interferences.clear();
  818.         }
  819.     }
  820. diff --git a/src/gallium/drivers/r600/sb/sb_pass.h b/src/gallium/drivers/r600/sb/sb_pass.h
  821. index 95d2a20..64d2e1d 100644
  822. --- a/src/gallium/drivers/r600/sb/sb_pass.h
  823. +++ b/src/gallium/drivers/r600/sb/sb_pass.h
  824. @@ -588,7 +588,9 @@ class ssa_prepare : public vpass {
  825.     unsigned level;
  826.  
  827.  public:
  828. -   ssa_prepare(shader &s) : vpass(s), level(0) {}
  829. +   ssa_prepare(shader &s) : vpass(s), stk(), level(0) {}
  830. +
  831. +   virtual int init() { stk.resize(1); return 0; }
  832.  
  833.     virtual bool visit(cf_node &n, bool enter);
  834.     virtual bool visit(alu_node &n, bool enter);
  835. diff --git a/src/gallium/drivers/r600/sb/sb_public.h b/src/gallium/drivers/r600/sb/sb_public.h
  836. index c9f5f97..c42ef2e 100644
  837. --- a/src/gallium/drivers/r600/sb/sb_public.h
  838. +++ b/src/gallium/drivers/r600/sb/sb_public.h
  839. @@ -31,6 +31,11 @@ struct r600_shader;
  840.  
  841.  void r600_sb_context_destroy(void *sctx);
  842.  
  843. +int r600_sb_compile_tgsi(struct r600_context *rctx,
  844. +                           struct r600_pipe_shader *pipe_shader,
  845. +                           struct r600_shader_key key,
  846. +                           int dump);
  847. +
  848.  int r600_sb_bytecode_process(struct r600_context *rctx,
  849.                               struct r600_bytecode *bc,
  850.                               struct r600_shader *pshader,
  851. diff --git a/src/gallium/drivers/r600/sb/sb_ra_init.cpp b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
  852. index 0b332a9..856a2d9 100644
  853. --- a/src/gallium/drivers/r600/sb/sb_ra_init.cpp
  854. +++ b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
  855. @@ -259,13 +259,13 @@ sel_chan regbits::find_free_chan_by_mask(unsigned mask) {
  856.  
  857.  void ra_init::alloc_arrays() {
  858.  
  859. -   gpr_array_vec &ga = sh.arrays();
  860. +   rel_array_vec &ga = sh.arrays();
  861.  
  862. -   for(gpr_array_vec::iterator I = ga.begin(), E = ga.end(); I != E; ++I) {
  863. -       gpr_array *a = *I;
  864. +   for(rel_array_vec::iterator I = ga.begin(), E = ga.end(); I != E; ++I) {
  865. +       rel_array *a = *I;
  866.  
  867.         RA_DUMP(
  868. -           sblog << "array [" << a->array_size << "] at " << a->base_gpr << "\n";
  869. +           sblog << "array [" << a->array_size << "] at " << a->base_sel << "\n";
  870.             sblog << "\n";
  871.         );
  872.  
  873. @@ -300,7 +300,7 @@ void ra_init::alloc_arrays() {
  874.         regbits rb(sh, s);
  875.  
  876.         sel_chan base = rb.find_free_array(a->array_size,
  877. -                                          (1 << a->base_gpr.chan()));
  878. +                                          (1 << a->base_sel.chan()));
  879.  
  880.         RA_DUMP( sblog << "  found base: " << base << "\n"; );
  881.  
  882. @@ -349,9 +349,7 @@ void ra_init::process_op(node* n) {
  883.                 break;
  884.             }
  885.         }
  886. -   }
  887. -
  888. -   if (n->is_fetch_inst() || n->is_cf_inst()) {
  889. +   } else if (n->is_fetch_inst() || n->is_cf_inst()) {
  890.         for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; ++I) {
  891.             value *v = *I;
  892.             if (v && v->is_sgpr())
  893. @@ -684,7 +682,6 @@ void ra_split::split_packed_ins(alu_packed_node *n) {
  894.     }
  895.  }
  896.  
  897. -// TODO handle other packed ops for cayman
  898.  void ra_split::split_alu_packed(alu_packed_node* n) {
  899.     switch (n->op()) {
  900.         case ALU_OP2_DOT4:
  901. @@ -692,6 +689,11 @@ void ra_split::split_alu_packed(alu_packed_node* n) {
  902.             split_packed_ins(n);
  903.             break;
  904.         default:
  905. +           if (ctx.is_cayman()) {
  906. +               unsigned slots = ctx.alu_slots(n->op_ptr());
  907. +               if (slots == AF_4VE || slots == AF_S)
  908. +                   split_packed_ins(n);
  909. +           }
  910.             break;
  911.     }
  912.  }
  913. @@ -723,7 +725,7 @@ void ra_split::split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz) {
  914.  
  915.                 if (!allow_swz) {
  916.                     t->flags |= VLF_PIN_CHAN;
  917. -                   t->pin_gpr = sel_chan(0, ch);
  918. +                   t->pin_gpr = sel_chan(t->pin_gpr.sel(), ch);
  919.                 }
  920.  
  921.                 v2.push_back(o);
  922. @@ -812,6 +814,11 @@ void ra_split::split_vector_inst(node* n) {
  923.                     } else
  924.                         sel = s->select;
  925.  
  926. +                   // FIXME: handle this more cleanly
  927. +                   // (propagate pin_gpr in ssa rename)
  928. +                   if (s->kind == VLK_TGSI_INPUT)
  929. +                       sel = sel_chan(sel.sel() + 1, sel.chan());
  930. +
  931.                     v->gpr = v->pin_gpr = sel;
  932.                     v->fix();
  933.                 }
  934. diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
  935. index f0e41f5..fd0f761 100644
  936. --- a/src/gallium/drivers/r600/sb/sb_sched.cpp
  937. +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
  938. @@ -939,9 +939,10 @@ void post_scheduler::update_live(node *n, val_set *born) {
  939.  
  940.  void post_scheduler::process_group() {
  941.     alu_group_tracker &rt = alu.grp();
  942. -
  943.     val_set vals_born;
  944.  
  945. +   prev_array_read.clear();
  946. +
  947.     recolor_locals();
  948.  
  949.     PSC_DUMP(
  950. @@ -956,6 +957,7 @@ void post_scheduler::process_group() {
  951.             continue;
  952.  
  953.         update_live(n, &vals_born);
  954. +       update_prev_array_read(n);
  955.     }
  956.  
  957.     PSC_DUMP(
  958. @@ -1014,7 +1016,10 @@ void post_scheduler::schedule_alu(container_node *c) {
  959.         prev_regmap = regmap;
  960.  
  961.         if (!prepare_alu_group()) {
  962. -           if (alu.current_ar) {
  963. +           if (latency_check_failed) {
  964. +               emit_nop_group();
  965. +               continue;
  966. +           } else if (alu.current_ar) {
  967.                 emit_load_ar();
  968.                 continue;
  969.             } else
  970. @@ -1263,6 +1268,11 @@ bool post_scheduler::map_src_val(value *v) {
  971.         return true;
  972.  
  973.     sel_chan gpr = v->get_final_gpr();
  974. +
  975. +   PSC_DUMP(
  976. +       sblog << "map src " << *v << " to " << gpr << "\n";
  977. +   );
  978. +
  979.     rv_map::iterator F = regmap.find(gpr);
  980.     value *c = NULL;
  981.     if (F != regmap.end()) {
  982. @@ -1436,6 +1446,11 @@ unsigned post_scheduler::try_add_instruction(node *n) {
  983.  
  984.     alu_group_tracker &rt = alu.grp();
  985.  
  986. +#if 0  // this does not seem to be a problem so far, at least on evergreen
  987. +   if (!check_latency(n))
  988. +       return 0;
  989. +#endif
  990. +
  991.     unsigned avail_slots = rt.avail_slots();
  992.  
  993.     if (n->is_alu_packed()) {
  994. @@ -1606,6 +1621,8 @@ bool post_scheduler::prepare_alu_group() {
  995.  
  996.     alu_group_tracker &rt = alu.grp();
  997.  
  998. +   latency_check_failed = false;
  999. +
  1000.     unsigned i1 = 0;
  1001.  
  1002.     PSC_DUMP(
  1003. @@ -1634,7 +1651,6 @@ bool post_scheduler::prepare_alu_group() {
  1004.                 sblog << "\n";
  1005.             );
  1006.  
  1007. -
  1008.             unsigned cnt = try_add_instruction(n);
  1009.  
  1010.             if (!cnt)
  1011. @@ -1970,4 +1986,60 @@ void rp_gpr_tracker::dump() {
  1012.     }
  1013.  }
  1014.  
  1015. +void post_scheduler::update_prev_array_read(alu_node* n) {
  1016. +   for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; ++I) {
  1017. +       value *v = *I;
  1018. +
  1019. +       if (!v || !v->array)
  1020. +           continue;
  1021. +
  1022. +       prev_array_read.push_back(v);
  1023. +   }
  1024. +}
  1025. +
  1026. +bool post_scheduler::check_latency(node* n) {
  1027. +   for (vvec::iterator I = n->dst.begin(), E = n->dst.end(); I != E; ++I) {
  1028. +       value *d = *I;
  1029. +
  1030. +       if (!d || !d->array)
  1031. +           continue;
  1032. +
  1033. +       if (!check_value_latency(d))
  1034. +           return false;
  1035. +   }
  1036. +   return true;
  1037. +}
  1038. +
  1039. +bool post_scheduler::check_value_latency(value* v) {
  1040. +   for (vvec::iterator I = prev_array_read.begin(), E = prev_array_read.end();
  1041. +           I != E; ++I) {
  1042. +       value *r = *I;
  1043. +
  1044. +       if (r->array == v->array) {
  1045. +           bool rel_write = v->is_rel();
  1046. +           bool rel_read = r->is_rel();
  1047. +
  1048. +           if (rel_write ^ rel_read) {
  1049. +               latency_check_failed = true;
  1050. +               return false;
  1051. +           }
  1052. +       }
  1053. +   }
  1054. +   return true;
  1055. +}
  1056. +
  1057. +void post_scheduler::emit_nop_group() {
  1058. +   alu_node * a = sh.create_alu();
  1059. +   a->bc.set_op(ALU_OP0_NOP);
  1060. +
  1061. +   alu_group_tracker &rt = alu.grp();
  1062. +   if (!rt.try_reserve(a)) {
  1063. +       sblog << "can't emit NOP group : ";
  1064. +       dump::dump_op(a);
  1065. +       sblog << "\n";
  1066. +   }
  1067. +
  1068. +   alu.emit_group();
  1069. +}
  1070. +
  1071.  } // namespace r600_sb
  1072. diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h
  1073. index a74484f..40e8b15 100644
  1074. --- a/src/gallium/drivers/r600/sb/sb_sched.h
  1075. +++ b/src/gallium/drivers/r600/sb/sb_sched.h
  1076. @@ -254,11 +254,15 @@ class post_scheduler : public pass {
  1077.  
  1078.     val_set cleared_interf;
  1079.  
  1080. +   vvec prev_array_read;
  1081. +   bool latency_check_failed;
  1082. +
  1083.  public:
  1084.  
  1085.     post_scheduler(shader &sh) : pass(sh),
  1086.         ready(), ready_copies(), pending(), cur_bb(),
  1087. -       live(), ucm(), alu(sh), regmap(), cleared_interf() {}
  1088. +       live(), ucm(), alu(sh), regmap(), cleared_interf(),
  1089. +       prev_array_read(), latency_check_failed() {}
  1090.  
  1091.     virtual int run();
  1092.     void run_on(container_node *n);
  1093. @@ -317,6 +321,11 @@ public:
  1094.     void emit_clause();
  1095.  
  1096.     void process_ready_copies();
  1097. +
  1098. +   void update_prev_array_read(alu_node *n);
  1099. +   bool check_latency(node *n);
  1100. +   bool check_value_latency(value *v);
  1101. +   void emit_nop_group();
  1102.  };
  1103.  
  1104.  } // namespace r600_sb
  1105. diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp
  1106. index 9fc47ae..2443364 100644
  1107. --- a/src/gallium/drivers/r600/sb/sb_shader.cpp
  1108. +++ b/src/gallium/drivers/r600/sb/sb_shader.cpp
  1109. @@ -30,24 +30,23 @@
  1110.  
  1111.  namespace r600_sb {
  1112.  
  1113. -shader::shader(sb_context &sctx, shader_target t, unsigned id)
  1114. -: ctx(sctx), next_temp_value_index(temp_regid_offset),
  1115. -  prep_regs_count(), pred_sels(),
  1116. -  regions(), inputs(), undef(), val_pool(sizeof(value)),
  1117. -  pool(), all_nodes(), src_stats(), opt_stats(), errors(),
  1118. -  optimized(), id(id),
  1119. -  coal(*this), bbs(),
  1120. -  target(t), vt(ex), ex(*this), root(),
  1121. -  compute_interferences(),
  1122. -  has_alu_predication(), uses_gradients(), safe_math(), ngpr(), nstack() {}
  1123. +shader::shader(sb_context &sctx, shader_target t, unsigned id, bool direct_tgsi)
  1124. +       : ctx(sctx), next_temp_value_index(temp_regid_offset), pred_sels(),
  1125. +           regions(), inputs(), undef(), val_pool(sizeof(value)), pool(),
  1126. +           all_nodes(), bc(sctx.hw_class_bit()), src_stats(), opt_stats(),
  1127. +           errors(), optimized(), id(id), coal(*this), bbs(), target(t),
  1128. +           vt(ex), ex(*this), root(), compute_interferences(),
  1129. +           has_alu_predication(), uses_gradients(), safe_math(), ngpr(),
  1130. +           nstack(), direct_tgsi(direct_tgsi) {
  1131. +}
  1132.  
  1133.  bool shader::assign_slot(alu_node* n, alu_node *slots[5]) {
  1134.  
  1135.     unsigned slot_flags = ctx.alu_slots(n->bc.op);
  1136.     unsigned slot = n->bc.dst_chan;
  1137.  
  1138. -   if (!ctx.is_cayman() && (!(slot_flags & AF_V) || slots[slot]) &&
  1139. -           (slot_flags & AF_S))
  1140. +   if (!ctx.is_cayman() && (!(slot_flags & AF_V) || slots[slot])
  1141. +           && (slot_flags & AF_S))
  1142.         slot = SLOT_TRANS;
  1143.  
  1144.     if (slots[slot])
  1145. @@ -59,7 +58,7 @@ bool shader::assign_slot(alu_node* n, alu_node *slots[5]) {
  1146.  }
  1147.  
  1148.  void shader::add_pinned_gpr_values(vvec& vec, unsigned gpr, unsigned comp_mask,
  1149. -                            bool src) {
  1150. +                                   bool src) {
  1151.     unsigned chan = 0;
  1152.     while (comp_mask) {
  1153.         if (comp_mask & 1) {
  1154. @@ -72,7 +71,7 @@ void shader::add_pinned_gpr_values(vvec& vec, unsigned gpr, unsigned comp_mask,
  1155.             if (v->array && !v->array->gpr) {
  1156.                 // if pinned value can be accessed with indirect addressing
  1157.                 // pin the entire array to its original location
  1158. -               v->array->gpr = v->array->base_gpr;
  1159. +               v->array->gpr = v->array->base_sel;
  1160.             }
  1161.             vec.push_back(v);
  1162.         }
  1163. @@ -81,16 +80,49 @@ void shader::add_pinned_gpr_values(vvec& vec, unsigned gpr, unsigned comp_mask,
  1164.     }
  1165.  }
  1166.  
  1167. +void shader::add_pinned_inputs(vvec& vec, value_kind kind, unsigned sel,
  1168. +                               unsigned comp_mask, bool src,
  1169. +                               unsigned pin_gpr_sel) {
  1170. +   unsigned chan = 0;
  1171. +   while (comp_mask) {
  1172. +       if (comp_mask & 1) {
  1173. +           value *v = get_reg_value(kind, src, sel, chan, false);
  1174. +           if (!v->array) {
  1175. +               v->flags |= (VLF_PIN_REG | VLF_PIN_CHAN);
  1176. +               v->gpr = v->pin_gpr = sel_chan(pin_gpr_sel, chan);
  1177. +               v->fix();
  1178. +           }
  1179. +/*         if (v->array && !v->array->gpr) {
  1180. +               // if pinned value can be accessed with indirect addressing
  1181. +               // pin the entire array to its original location
  1182. +               v->array->gpr = sel_chan(
  1183. +                   pin_gpr_sel - (sel - v->array->base_sel), chan);
  1184. +           }
  1185. +*/         vec.push_back(v);
  1186. +       }
  1187. +       comp_mask >>= 1;
  1188. +       ++chan;
  1189. +   }
  1190. +}
  1191. +
  1192.  cf_node* shader::create_clause(node_subtype nst) {
  1193.     cf_node *n = create_cf();
  1194.  
  1195.     n->subtype = nst;
  1196.  
  1197.     switch (nst) {
  1198. -   case NST_ALU_CLAUSE: n->bc.set_op(CF_OP_ALU); break;
  1199. -   case NST_TEX_CLAUSE: n->bc.set_op(CF_OP_TEX); break;
  1200. -   case NST_VTX_CLAUSE: n->bc.set_op(CF_OP_VTX); break;
  1201. -   default: assert(!"invalid clause type"); break;
  1202. +   case NST_ALU_CLAUSE:
  1203. +       n->bc.set_op(CF_OP_ALU);
  1204. +       break;
  1205. +   case NST_TEX_CLAUSE:
  1206. +       n->bc.set_op(CF_OP_TEX);
  1207. +       break;
  1208. +   case NST_VTX_CLAUSE:
  1209. +       n->bc.set_op(CF_OP_VTX);
  1210. +       break;
  1211. +   default:
  1212. +       assert(!"invalid clause type");
  1213. +       break;
  1214.     }
  1215.  
  1216.     n->bc.barrier = 1;
  1217. @@ -127,13 +159,11 @@ alu_node* shader::create_copy_mov(value* dst, value* src, unsigned affcost) {
  1218.     return n;
  1219.  }
  1220.  
  1221. -value* shader::get_value(value_kind kind, sel_chan id,
  1222. -                         unsigned version) {
  1223. -   if (version == 0 && kind == VLK_REG && id.sel() < prep_regs_count)
  1224. -       return val_pool[id - 1];
  1225. +value* shader::get_value(value_kind kind, sel_chan id, unsigned version) {
  1226. +   unsigned key = (kind << 28) | (version << 14) | id;
  1227. +   assert((id & ((1 << 14) - 1)) == id);
  1228. +   assert((version & ((1 << 14) - 1)) == version);
  1229.  
  1230. -
  1231. -   unsigned key = (kind << 28) | (version << 16) | id;
  1232.     value_map::iterator i = reg_values.find(key);
  1233.     if (i != reg_values.end()) {
  1234.         return i->second;
  1235. @@ -148,53 +178,51 @@ value* shader::get_special_value(unsigned sv_id, unsigned version) {
  1236.     return get_value(VLK_SPECIAL_REG, id, version);
  1237.  }
  1238.  
  1239. -void shader::fill_array_values(gpr_array *a, vvec &vv) {
  1240. +void shader::fill_array_values(rel_array *a, vvec &vv) {
  1241.     unsigned sz = a->array_size;
  1242.     vv.resize(sz);
  1243.     for (unsigned i = 0; i < a->array_size; ++i) {
  1244. -       vv[i] = get_gpr_value(true, a->base_gpr.sel() + i, a->base_gpr.chan(),
  1245. -                             false);
  1246. +       vv[i] = get_reg_value(a->kind, true, a->base_sel.sel() + i,
  1247. +           a->base_sel.chan(), false);
  1248.     }
  1249.  }
  1250.  
  1251. -value* shader::get_gpr_value(bool src, unsigned reg, unsigned chan, bool rel,
  1252. -                             unsigned version) {
  1253. -   sel_chan id(reg, chan);
  1254. +value* shader::get_reg_value(value_kind kind, bool src, unsigned sel,
  1255. +                             unsigned chan, bool rel, value *r,
  1256. +                             unsigned arr_id) {
  1257. +   sel_chan id(sel, chan);
  1258.     value *v;
  1259. -   gpr_array *a = get_gpr_array(reg, chan);
  1260. +   rel_array *a = get_rel_array(kind, sel, chan);
  1261.     if (rel) {
  1262.         assert(a);
  1263. -       v = create_value(VLK_REL_REG, id, 0);
  1264. -       v->rel = get_special_value(SV_AR_INDEX);
  1265. +       v = create_value(kind, id, 0);
  1266. +       if (!r)
  1267. +           r = get_special_value(SV_AR_INDEX);
  1268. +       v->rel = r;
  1269.         fill_array_values(a, v->muse);
  1270.         if (!src)
  1271.             fill_array_values(a, v->mdef);
  1272.     } else {
  1273. -       if (version == 0 && reg < prep_regs_count)
  1274. -           return (val_pool[id - 1]);
  1275. -
  1276. -       v = get_value(VLK_REG, id, version);
  1277. +       v = get_value(kind, id);
  1278.     }
  1279.  
  1280.     v->array = a;
  1281. -   v->pin_gpr = v->select;
  1282. -
  1283.     return v;
  1284.  }
  1285.  
  1286. -value* shader::create_temp_value() {
  1287. -   sel_chan id(++next_temp_value_index, 0);
  1288. +value* shader::create_temp_value(int chan) {
  1289. +   sel_chan id(++next_temp_value_index, chan);
  1290.     return get_value(VLK_TEMP, id, 0);
  1291.  }
  1292.  
  1293.  value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan) {
  1294.     return get_ro_value(kcache_values, VLK_KCACHE,
  1295. -           sel_chan((bank << 12) | index, chan));
  1296. +       sel_chan((bank << 12) | index, chan));
  1297.  }
  1298.  
  1299.  void shader::add_input(unsigned gpr, bool preloaded, unsigned comp_mask) {
  1300.     if (inputs.size() <= gpr)
  1301. -       inputs.resize(gpr+1);
  1302. +       inputs.resize(gpr + 1);
  1303.  
  1304.     shader_input &i = inputs[gpr];
  1305.     i.preloaded = preloaded;
  1306. @@ -203,7 +231,6 @@ void shader::add_input(unsigned gpr, bool preloaded, unsigned comp_mask) {
  1307.     if (preloaded) {
  1308.         add_pinned_gpr_values(root->dst, gpr, comp_mask, true);
  1309.     }
  1310. -
  1311.  }
  1312.  
  1313.  void shader::init() {
  1314. @@ -216,8 +243,8 @@ void shader::init_call_fs(cf_node* cf) {
  1315.  
  1316.     assert(target == TARGET_VS);
  1317.  
  1318. -   for(inputs_vec::const_iterator I = inputs.begin(),
  1319. -           E = inputs.end(); I != E; ++I, ++gpr) {
  1320. +   for (inputs_vec::const_iterator I = inputs.begin(), E = inputs.end();
  1321. +           I != E; ++I, ++gpr) {
  1322.         if (!I->preloaded)
  1323.             add_pinned_gpr_values(cf->dst, gpr, I->comp_mask, false);
  1324.         else
  1325. @@ -232,7 +259,8 @@ void shader::set_undef(val_set& s) {
  1326.  
  1327.     val_set &vs = s;
  1328.  
  1329. -   for (val_set::iterator I = vs.begin(*this), E = vs.end(*this); I != E; ++I) {
  1330. +   for (val_set::iterator I = vs.begin(*this), E = vs.end(*this); I != E;
  1331. +           ++I) {
  1332.         value *v = *I;
  1333.  
  1334.         assert(!v->is_readonly() && !v->is_rel());
  1335. @@ -267,14 +295,14 @@ alu_node* shader::create_alu() {
  1336.  
  1337.  alu_group_node* shader::create_alu_group() {
  1338.     alu_group_node* n =
  1339. -           new (pool.allocate(sizeof(alu_group_node))) alu_group_node();
  1340. +           new (pool.allocate(sizeof(alu_group_node))) alu_group_node();
  1341.     all_nodes.push_back(n);
  1342.     return n;
  1343.  }
  1344.  
  1345.  alu_packed_node* shader::create_alu_packed() {
  1346.     alu_packed_node* n =
  1347. -           new (pool.allocate(sizeof(alu_packed_node))) alu_packed_node();
  1348. +           new (pool.allocate(sizeof(alu_packed_node))) alu_packed_node();
  1349.     all_nodes.push_back(n);
  1350.     return n;
  1351.  }
  1352. @@ -295,33 +323,34 @@ fetch_node* shader::create_fetch() {
  1353.  }
  1354.  
  1355.  region_node* shader::create_region() {
  1356. -   region_node *n = new (pool.allocate(sizeof(region_node)))
  1357. -           region_node(regions.size());
  1358. +   region_node *n = new (pool.allocate(sizeof(region_node))) region_node(
  1359. +       regions.size());
  1360.     regions.push_back(n);
  1361.     all_nodes.push_back(n);
  1362.     return n;
  1363.  }
  1364.  
  1365.  depart_node* shader::create_depart(region_node* target) {
  1366. -   depart_node* n = new (pool.allocate(sizeof(depart_node)))
  1367. -           depart_node(target, target->departs.size());
  1368. +   depart_node* n = new (pool.allocate(sizeof(depart_node))) depart_node(
  1369. +       target, target->departs.size());
  1370.     target->departs.push_back(n);
  1371.     all_nodes.push_back(n);
  1372.     return n;
  1373.  }
  1374.  
  1375.  repeat_node* shader::create_repeat(region_node* target) {
  1376. -   repeat_node* n = new (pool.allocate(sizeof(repeat_node)))
  1377. -           repeat_node(target, target->repeats.size() + 1);
  1378. +   repeat_node* n = new (pool.allocate(sizeof(repeat_node))) repeat_node(
  1379. +       target, target->repeats.size() + 1);
  1380.     target->repeats.push_back(n);
  1381.     all_nodes.push_back(n);
  1382.     return n;
  1383.  }
  1384.  
  1385.  container_node* shader::create_container(node_type nt, node_subtype nst,
  1386. -                                        node_flags flags) {
  1387. -   container_node *n = new (pool.allocate(sizeof(container_node)))
  1388. -           container_node(nt, nst, flags);
  1389. +                                         node_flags flags) {
  1390. +   container_node *n =
  1391. +           new (pool.allocate(sizeof(container_node))) container_node(nt, nst,
  1392. +               flags);
  1393.     all_nodes.push_back(n);
  1394.     return n;
  1395.  }
  1396. @@ -349,12 +378,12 @@ value* shader::get_const_value(const literal &v) {
  1397.  }
  1398.  
  1399.  shader::~shader() {
  1400. -   for (node_vec::iterator I = all_nodes.begin(), E = all_nodes.end();
  1401. -           I != E; ++I)
  1402. +   for (node_vec::iterator I = all_nodes.begin(), E = all_nodes.end(); I != E;
  1403. +           ++I)
  1404.         (*I)->~node();
  1405.  
  1406. -   for (gpr_array_vec::iterator I = gpr_arrays.begin(), E = gpr_arrays.end();
  1407. -           I != E; ++I) {
  1408. +   for (rel_array_vec::iterator I = rel_arrays.begin(), E = rel_arrays.end();
  1409. +           I != E; ++I) {
  1410.         delete *I;
  1411.     }
  1412.  }
  1413. @@ -376,32 +405,37 @@ value* shader::get_value_version(value* v, unsigned ver) {
  1414.     return vv;
  1415.  }
  1416.  
  1417. -gpr_array* shader::get_gpr_array(unsigned reg, unsigned chan) {
  1418. +rel_array* shader::get_rel_array(value_kind kind, unsigned sel, unsigned chan) {
  1419.  
  1420. -   for (regarray_vec::iterator I = gpr_arrays.begin(),
  1421. -           E = gpr_arrays.end(); I != E; ++I) {
  1422. -       gpr_array* a = *I;
  1423. -       unsigned achan = a->base_gpr.chan();
  1424. -       unsigned areg = a->base_gpr.sel();
  1425. -       if (achan == chan && (reg >= areg && reg < areg+a->array_size))
  1426. +   for (regarray_vec::iterator I = rel_arrays.begin(), E = rel_arrays.end();
  1427. +           I != E; ++I) {
  1428. +       rel_array* a = *I;
  1429. +       if (kind != a->kind)
  1430. +           continue;
  1431. +       unsigned achan = a->base_sel.chan();
  1432. +       unsigned areg = a->base_sel.sel();
  1433. +       if (achan == chan && (sel >= areg && sel < areg + a->array_size))
  1434.             return a;
  1435.     }
  1436.     return NULL;
  1437.  }
  1438.  
  1439. -void shader::add_gpr_array(unsigned gpr_start, unsigned gpr_count,
  1440. -                      unsigned comp_mask) {
  1441. +void shader::add_rel_array(value_kind kind, unsigned sel_start,
  1442. +                           unsigned sel_count, unsigned comp_mask,
  1443. +                           unsigned array_id) {
  1444.     unsigned chan = 0;
  1445.     while (comp_mask) {
  1446.         if (comp_mask & 1) {
  1447. -           gpr_array *a = new gpr_array(
  1448. -                   sel_chan(gpr_start, chan), gpr_count);
  1449. +           rel_array *a = new rel_array(kind, sel_chan(sel_start, chan),
  1450. +               sel_count, array_id);
  1451.  
  1452. -           SB_DUMP_PASS( sblog << "add_gpr_array: @" << a->base_gpr
  1453. -                    << " [" << a->array_size << "]\n";
  1454. +           SB_DUMP_PASS(
  1455. +               sblog << "add_rel_array: @" << a->base_sel << " ["
  1456. +                     << a->array_size << "]\n"
  1457. +               ;
  1458.             );
  1459.  
  1460. -           gpr_arrays.push_back(a);
  1461. +           rel_arrays.push_back(a);
  1462.         }
  1463.         comp_mask >>= 1;
  1464.         ++chan;
  1465. @@ -434,13 +468,18 @@ std::string shader::get_full_target_name() {
  1466.  
  1467.  const char* shader::get_shader_target_name() {
  1468.     switch (target) {
  1469. -       case TARGET_VS: return "VS";
  1470. -       case TARGET_PS: return "PS";
  1471. -       case TARGET_GS: return "GS";
  1472. -       case TARGET_COMPUTE: return "COMPUTE";
  1473. -       case TARGET_FETCH: return "FETCH";
  1474. -       default:
  1475. -           return "INVALID_TARGET";
  1476. +   case TARGET_VS:
  1477. +       return "VS";
  1478. +   case TARGET_PS:
  1479. +       return "PS";
  1480. +   case TARGET_GS:
  1481. +       return "GS";
  1482. +   case TARGET_COMPUTE:
  1483. +       return "COMPUTE";
  1484. +   case TARGET_FETCH:
  1485. +       return "FETCH";
  1486. +   default:
  1487. +       return "INVALID_TARGET";
  1488.     }
  1489.  }
  1490.  
  1491. @@ -457,7 +496,6 @@ void shader::simplify_dep_rep(node* dr) {
  1492.         dr->parent->cut(dr->next, NULL);
  1493.  }
  1494.  
  1495. -
  1496.  // FIXME this is used in some places as the max non-temp gpr,
  1497.  // (MAX_GPR - 2 * ctx.alu_temp_gprs) should be used for that instead.
  1498.  unsigned shader::first_temp_gpr() {
  1499. @@ -529,10 +567,8 @@ void shader::create_bbs(container_node* n, bbs_vec &bbs, int loop_level) {
  1500.         if (inside_bb && !last_inside_bb)
  1501.             bb_start = I;
  1502.         else if (!inside_bb) {
  1503. -           if (last_inside_bb
  1504. -                   && I->type != NT_REPEAT
  1505. -                   && I->type != NT_DEPART
  1506. -                   && I->type != NT_IF) {
  1507. +           if (last_inside_bb && I->type != NT_REPEAT && I->type != NT_DEPART
  1508. +                   && I->type != NT_IF) {
  1509.                 bb_node *bb = create_bb(bbs.size(), loop_level);
  1510.                 bbs.push_back(bb);
  1511.                 n->insert_node_before(*bb_start, bb);
  1512. @@ -548,7 +584,7 @@ void shader::create_bbs(container_node* n, bbs_vec &bbs, int loop_level) {
  1513.                 }
  1514.  
  1515.                 create_bbs(static_cast<container_node*>(k), bbs,
  1516. -                          loop_level + loop);
  1517. +                   loop_level + loop);
  1518.             }
  1519.         }
  1520.  
  1521. @@ -562,7 +598,7 @@ void shader::create_bbs(container_node* n, bbs_vec &bbs, int loop_level) {
  1522.         bb_node *bb = create_bb(bbs.size(), loop_level);
  1523.         bbs.push_back(bb);
  1524.         if (n->empty())
  1525. -               n->push_back(bb);
  1526. +           n->push_back(bb);
  1527.         else {
  1528.             n->insert_node_before(*bb_start, bb);
  1529.             if (bb_start != n->end())
  1530. @@ -587,22 +623,22 @@ void shader::expand_bbs(bbs_vec &bbs) {
  1531.  
  1532.  sched_queue_id shader::get_queue_id(node* n) {
  1533.     switch (n->subtype) {
  1534. -       case NST_ALU_INST:
  1535. -       case NST_ALU_PACKED_INST:
  1536. -       case NST_COPY:
  1537. -       case NST_PSI:
  1538. -           return SQ_ALU;
  1539. -       case NST_FETCH_INST: {
  1540. -           fetch_node *f = static_cast<fetch_node*>(n);
  1541. -           if (ctx.is_r600() && (f->bc.op_ptr->flags & FF_VTX))
  1542. -               return SQ_VTX;
  1543. -           return SQ_TEX;
  1544. -       }
  1545. -       case NST_CF_INST:
  1546. -           return SQ_CF;
  1547. -       default:
  1548. -           assert(0);
  1549. -           return SQ_NUM;
  1550. +   case NST_ALU_INST:
  1551. +   case NST_ALU_PACKED_INST:
  1552. +   case NST_COPY:
  1553. +   case NST_PSI:
  1554. +       return SQ_ALU;
  1555. +   case NST_FETCH_INST: {
  1556. +       fetch_node *f = static_cast<fetch_node*>(n);
  1557. +       if (ctx.is_r600() && (f->bc.op_ptr->flags & FF_VTX))
  1558. +           return SQ_VTX;
  1559. +       return SQ_TEX;
  1560. +   }
  1561. +   case NST_CF_INST:
  1562. +       return SQ_CF;
  1563. +   default:
  1564. +       assert(0);
  1565. +       return SQ_NUM;
  1566.     }
  1567.  }
  1568.  
  1569. @@ -647,10 +683,9 @@ void shader_stats::accumulate(shader_stats& s) {
  1570.  
  1571.  void shader_stats::dump() {
  1572.     sblog << "dw:" << ndw << ", gpr:" << ngpr << ", stk:" << nstack
  1573. -           << ", alu groups:" << alu_groups << ", alu clauses: " << alu_clauses
  1574. -           << ", alu:" << alu << ", fetch:" << fetch
  1575. -           << ", fetch clauses:" << fetch_clauses
  1576. -           << ", cf:" << cf;
  1577. +         << ", alu groups:" << alu_groups << ", alu clauses: " << alu_clauses
  1578. +         << ", alu:" << alu << ", fetch:" << fetch << ", fetch clauses:"
  1579. +         << fetch_clauses << ", cf:" << cf;
  1580.  
  1581.     if (shaders > 1)
  1582.         sblog << ", shaders:" << shaders;
  1583. @@ -660,7 +695,7 @@ void shader_stats::dump() {
  1584.  
  1585.  static void print_diff(unsigned d1, unsigned d2) {
  1586.     if (d1)
  1587. -       sblog << ((int)d2 - (int)d1) * 100 / (int)d1 << "%";
  1588. +       sblog << ((int) d2 - (int) d1) * 100 / (int) d1 << "%";
  1589.     else if (d2)
  1590.         sblog << "N/A";
  1591.     else
  1592. @@ -668,15 +703,24 @@ static void print_diff(unsigned d1, unsigned d2) {
  1593.  }
  1594.  
  1595.  void shader_stats::dump_diff(shader_stats& s) {
  1596. -   sblog << "dw:"; print_diff(ndw, s.ndw);
  1597. -   sblog << ", gpr:" ; print_diff(ngpr, s.ngpr);
  1598. -   sblog << ", stk:" ; print_diff(nstack, s.nstack);
  1599. -   sblog << ", alu groups:" ; print_diff(alu_groups, s.alu_groups);
  1600. -   sblog << ", alu clauses: " ; print_diff(alu_clauses, s.alu_clauses);
  1601. -   sblog << ", alu:" ; print_diff(alu, s.alu);
  1602. -   sblog << ", fetch:" ; print_diff(fetch, s.fetch);
  1603. -   sblog << ", fetch clauses:" ; print_diff(fetch_clauses, s.fetch_clauses);
  1604. -   sblog << ", cf:" ; print_diff(cf, s.cf);
  1605. +   sblog << "dw:";
  1606. +   print_diff(ndw, s.ndw);
  1607. +   sblog << ", gpr:";
  1608. +   print_diff(ngpr, s.ngpr);
  1609. +   sblog << ", stk:";
  1610. +   print_diff(nstack, s.nstack);
  1611. +   sblog << ", alu groups:";
  1612. +   print_diff(alu_groups, s.alu_groups);
  1613. +   sblog << ", alu clauses: ";
  1614. +   print_diff(alu_clauses, s.alu_clauses);
  1615. +   sblog << ", alu:";
  1616. +   print_diff(alu, s.alu);
  1617. +   sblog << ", fetch:";
  1618. +   print_diff(fetch, s.fetch);
  1619. +   sblog << ", fetch clauses:";
  1620. +   print_diff(fetch_clauses, s.fetch_clauses);
  1621. +   sblog << ", cf:";
  1622. +   print_diff(cf, s.cf);
  1623.     sblog << "\n";
  1624.  }
  1625.  
  1626. diff --git a/src/gallium/drivers/r600/sb/sb_shader.h b/src/gallium/drivers/r600/sb/sb_shader.h
  1627. index e515d31..abc2d6b 100644
  1628. --- a/src/gallium/drivers/r600/sb/sb_shader.h
  1629. +++ b/src/gallium/drivers/r600/sb/sb_shader.h
  1630. @@ -52,7 +52,7 @@ typedef std::multimap<node*, error_info> error_map;
  1631.  class sb_context;
  1632.  
  1633.  typedef std::vector<shader_input> inputs_vec;
  1634. -typedef std::vector<gpr_array*> gpr_array_vec;
  1635. +typedef std::vector<rel_array*> rel_array_vec;
  1636.  
  1637.  struct ra_edge {
  1638.     value *a, *b;
  1639. @@ -234,7 +234,7 @@ private:
  1640.     ra_chunk* detach_value(value *v);
  1641.  };
  1642.  
  1643. -
  1644. +// =============================================================================
  1645.  
  1646.  class shader {
  1647.  
  1648. @@ -248,12 +248,10 @@ class shader {
  1649.     value_map special_ro_values; //  key - hw alu_sel & chan
  1650.     value_map kcache_values;
  1651.  
  1652. -   gpr_array_vec gpr_arrays;
  1653. +   rel_array_vec rel_arrays;
  1654.  
  1655.     unsigned next_temp_value_index;
  1656.  
  1657. -   unsigned prep_regs_count;
  1658. -
  1659.     value* pred_sels[2];
  1660.  
  1661.     regions_vec regions;
  1662. @@ -266,6 +264,8 @@ class shader {
  1663.  
  1664.     std::vector<node*> all_nodes;
  1665.  
  1666. +   bytecode bc;
  1667. +
  1668.  public:
  1669.     shader_stats src_stats, opt_stats;
  1670.  
  1671. @@ -277,7 +277,7 @@ public:
  1672.  
  1673.     coalescer coal;
  1674.  
  1675. -   static const unsigned temp_regid_offset = 512;
  1676. +   static const unsigned temp_regid_offset = 0;
  1677.  
  1678.     bbs_vec bbs;
  1679.  
  1680. @@ -289,26 +289,33 @@ public:
  1681.     container_node *root;
  1682.  
  1683.     bool compute_interferences;
  1684. -
  1685.     bool has_alu_predication;
  1686.     bool uses_gradients;
  1687. -
  1688.     bool safe_math;
  1689.  
  1690.     unsigned ngpr, nstack;
  1691.  
  1692. -   shader(sb_context &sctx, shader_target t, unsigned id);
  1693. +   bool direct_tgsi;
  1694. +
  1695. +   shader(sb_context &sctx, shader_target t, unsigned id,
  1696. +          bool direct_tgsi = false);
  1697.  
  1698.     ~shader();
  1699.  
  1700. +   bytecode& get_bytecode() { return bc; }
  1701. +
  1702.     sb_context &get_ctx() const { return ctx; }
  1703.  
  1704.     value* get_const_value(const literal & v);
  1705.     value* get_special_value(unsigned sv_id, unsigned version = 0);
  1706. -   value* create_temp_value();
  1707. -   value* get_gpr_value(bool src, unsigned reg, unsigned chan, bool rel,
  1708. -                         unsigned version = 0);
  1709. +   value* create_temp_value(int chan = 0);
  1710. +   value* get_reg_value(value_kind kind, bool src, unsigned reg,
  1711. +                        unsigned chan, bool rel, value *r = NULL,
  1712. +                         unsigned arr_id = 0);
  1713.  
  1714. +   value* get_gpr_value(bool src, unsigned reg, unsigned chan, bool rel) {
  1715. +       return get_reg_value(VLK_REG, src, reg, chan, rel);
  1716. +   }
  1717.  
  1718.     value* get_special_ro_value(unsigned sel);
  1719.     value* get_kcache_value(unsigned bank, unsigned index, unsigned chan);
  1720. @@ -316,17 +323,22 @@ public:
  1721.     value* get_value_version(value* v, unsigned ver);
  1722.  
  1723.     void init();
  1724. -   void add_pinned_gpr_values(vvec& vec, unsigned gpr, unsigned comp_mask, bool src);
  1725. +   void add_pinned_gpr_values(vvec& vec, unsigned gpr, unsigned comp_mask,
  1726. +                              bool src);
  1727. +
  1728. +   void add_pinned_inputs(vvec& vec, value_kind kind, unsigned sel,
  1729. +                                  unsigned comp_mask, bool src,
  1730. +                                  unsigned pin_gpr_sel);
  1731.  
  1732.     void dump_ir();
  1733.  
  1734. -   void add_gpr_array(unsigned gpr_start, unsigned gpr_count,
  1735. -                      unsigned comp_mask);
  1736. +   void add_rel_array(value_kind kind, unsigned sel_start, unsigned sel_count,
  1737. +                      unsigned comp_mask, unsigned array_id = 0);
  1738.  
  1739.     value* get_pred_sel(int sel);
  1740.     bool assign_slot(alu_node *n, alu_node *slots[5]);
  1741.  
  1742. -   gpr_array* get_gpr_array(unsigned reg, unsigned chan);
  1743. +   rel_array* get_rel_array(value_kind kind, unsigned sel, unsigned chan);
  1744.  
  1745.     void add_input(unsigned gpr, bool preloaded = false,
  1746.                    unsigned comp_mask = 0xF);
  1747. @@ -381,11 +393,11 @@ public:
  1748.     unsigned first_temp_gpr();
  1749.     unsigned num_nontemp_gpr();
  1750.  
  1751. -   gpr_array_vec& arrays() { return gpr_arrays; }
  1752. +   rel_array_vec& arrays() { return rel_arrays; }
  1753.  
  1754.     void set_uses_kill();
  1755.  
  1756. -   void fill_array_values(gpr_array *a, vvec &vv);
  1757. +   void fill_array_values(rel_array *a, vvec &vv);
  1758.  
  1759.     alu_node* clone(alu_node *n);
  1760.  
  1761. @@ -393,10 +405,11 @@ public:
  1762.  
  1763.     void collect_stats(bool opt);
  1764.  
  1765. -private:
  1766. -   value* create_value(value_kind k, sel_chan regid, unsigned ver);
  1767.     value* get_value(value_kind kind, sel_chan id,
  1768.                              unsigned version = 0);
  1769. +
  1770. +private:
  1771. +   value* create_value(value_kind k, sel_chan regid, unsigned ver);
  1772.     value* get_ro_value(value_map &vm, value_kind vk, unsigned key);
  1773.  };
  1774.  
  1775. diff --git a/src/gallium/drivers/r600/sb/sb_ssa_builder.cpp b/src/gallium/drivers/r600/sb/sb_ssa_builder.cpp
  1776. index 3ad628b..6df2979 100644
  1777. --- a/src/gallium/drivers/r600/sb/sb_ssa_builder.cpp
  1778. +++ b/src/gallium/drivers/r600/sb/sb_ssa_builder.cpp
  1779. @@ -201,8 +201,11 @@ bool ssa_rename::visit(alu_node& n, bool enter) {
  1780.  
  1781.         if (!n.dst.empty() && n.dst[0]) {
  1782.             // FIXME probably use separate pass for such things
  1783. -           if ((n.bc.op_ptr->flags & AF_INTERP) || n.bc.op == ALU_OP2_CUBE)
  1784. +           if ((n.bc.op_ptr->flags & AF_INTERP) || n.bc.op == ALU_OP2_CUBE) {
  1785.                 n.dst[0]->flags |= VLF_PIN_CHAN;
  1786. +               n.dst[0]->pin_gpr = sel_chan(n.dst[0]->pin_gpr.sel(),
  1787. +                   n.bc.slot);
  1788. +           }
  1789.         }
  1790.     }
  1791.     return true;
  1792. diff --git a/src/gallium/drivers/r600/sb/sb_tgsi.cpp b/src/gallium/drivers/r600/sb/sb_tgsi.cpp
  1793. new file mode 100644
  1794. index 0000000..361323d
  1795. --- /dev/null
  1796. +++ b/src/gallium/drivers/r600/sb/sb_tgsi.cpp
  1797. @@ -0,0 +1,2335 @@
  1798. +/*
  1799. + * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
  1800. + *
  1801. + * Permission is hereby granted, free of charge, to any person obtaining a
  1802. + * copy of this software and associated documentation files (the "Software"),
  1803. + * to deal in the Software without restriction, including without limitation
  1804. + * on the rights to use, copy, modify, merge, publish, distribute, sub
  1805. + * license, and/or sell copies of the Software, and to permit persons to whom
  1806. + * the Software is furnished to do so, subject to the following conditions:
  1807. + *
  1808. + * The above copyright notice and this permission notice (including the next
  1809. + * paragraph) shall be included in all copies or substantial portions of the
  1810. + * Software.
  1811. + *
  1812. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  1813. + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  1814. + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  1815. + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  1816. + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  1817. + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  1818. + * USE OR OTHER DEALINGS IN THE SOFTWARE.
  1819. + *
  1820. + * Authors:
  1821. + *      Vadim Girlin
  1822. + */
  1823. +
  1824. +extern "C" {
  1825. +#include "r600_shader.h"
  1826. +#include "r600_formats.h"
  1827. +}
  1828. +
  1829. +#include "sb_shader.h"
  1830. +#include "sb_tgsi.h"
  1831. +
  1832. +namespace r600_sb {
  1833. +
  1834. +alu_src tgsi_translator::null_alu_src = alu_src();
  1835. +
  1836. +#define TI_DESC(op, isa_op, func, flags) \
  1837. +   {#op, TGSI_OPCODE_##op, isa_op, &tgsi_translator::func, flags}
  1838. +
  1839. +#define TI_GAP {}
  1840. +
  1841. +const tgsi_translator::tgsi_inst_info tgsi_translator::tgsi_info_table[TGSI_OPCODE_LAST] =
  1842. +        {
  1843. +        /*   0 */ TI_DESC(ARL, 0, ti_arl, 0),
  1844. +        /*   1 */ TI_DESC(MOV, ALU_OP1_MOV, ti_alu, 0),
  1845. +        /*   2 */ TI_DESC(LIT, 0, ti_lit, 0),
  1846. +        /*   3 */ TI_DESC(RCP, ALU_OP1_RECIP_IEEE, ti_repl, 0),
  1847. +        /*   4 */ TI_DESC(RSQ, ALU_OP1_RECIPSQRT_CLAMPED, ti_repl, 0),
  1848. +        /*   5 */ TI_DESC(EXP, 0, ti_exp, 0),
  1849. +        /*   6 */ TI_DESC(LOG, 0, ti_log, 0),
  1850. +        /*   7 */ TI_DESC(MUL, ALU_OP2_MUL, ti_alu, 0),
  1851. +        /*   8 */ TI_DESC(ADD, ALU_OP2_ADD, ti_alu, 0),
  1852. +        /*   9 */ TI_DESC(DP3, 0, ti_dot, 0),
  1853. +        /*  10 */ TI_DESC(DP4, 0, ti_dot, 0),
  1854. +        /*  11 */ TI_DESC(DST, 0, ti_dst, 0),
  1855. +        /*  12 */ TI_DESC(MIN, ALU_OP2_MIN, ti_alu, 0),
  1856. +        /*  13 */ TI_DESC(MAX, ALU_OP2_MAX, ti_alu, 0),
  1857. +        /*  14 */ TI_DESC(SLT, ALU_OP2_SETGT, ti_alu, TIF_ALU_SWAPSRC01),
  1858. +        /*  15 */ TI_DESC(SGE, ALU_OP2_SETGE, ti_alu, 0),
  1859. +        /*  16 */ TI_DESC(MAD, ALU_OP3_MULADD, ti_alu, 0),
  1860. +        /*  17 */ TI_DESC(SUB, ALU_OP2_ADD, ti_alu, 0),
  1861. +        /*  18 */ TI_DESC(LRP, 0, ti_lrp, 0),
  1862. +        /*  19 */ TI_DESC(CND, 0, ti_unsupported, 0),
  1863. +        /*  20 */ TI_DESC(SQRT, 0, ti_unsupported, 0),
  1864. +        /*  21 */ TI_DESC(DP2A, 0, ti_unsupported, 0),
  1865. +        /*  22 */ TI_GAP,
  1866. +        /*  23 */ TI_GAP,
  1867. +        /*  24 */ TI_DESC(FRC, ALU_OP1_FRACT, ti_alu, 0),
  1868. +        /*  25 */ TI_DESC(CLAMP, 0, ti_unsupported, 0),
  1869. +        /*  26 */ TI_DESC(FLR, ALU_OP1_FLOOR, ti_alu, 0),
  1870. +        /*  27 */ TI_DESC(ROUND, ALU_OP1_RNDNE, ti_alu, 0),
  1871. +        /*  28 */ TI_DESC(EX2, ALU_OP1_EXP_IEEE, ti_repl, 0),
  1872. +        /*  29 */ TI_DESC(LG2, ALU_OP1_LOG_IEEE, ti_repl, 0),
  1873. +        /*  30 */ TI_DESC(POW, 0, ti_pow, 0),
  1874. +        /*  31 */ TI_DESC(XPD, 0, ti_xpd, 0),
  1875. +        /*  32 */ TI_GAP,
  1876. +        /*  33 */ TI_DESC(ABS, ALU_OP1_MOV, ti_alu, 0),
  1877. +        /*  34 */ TI_DESC(RCC, 0, ti_unsupported, 0),
  1878. +        /*  35 */ TI_DESC(DPH, 0, ti_dot, 0),
  1879. +        /*  36 */ TI_DESC(COS, ALU_OP1_COS, ti_trig, 0),
  1880. +        /*  37 */ TI_DESC(DDX, FETCH_OP_GET_GRADIENTS_H, ti_tex, 0),
  1881. +        /*  38 */ TI_DESC(DDY, FETCH_OP_GET_GRADIENTS_V, ti_tex, 0),
  1882. +        /*  39 */ TI_DESC(KILL, 0, ti_kill, 0),
  1883. +        /*  40 */ TI_DESC(PK2H, 0, ti_unsupported, 0),
  1884. +        /*  41 */ TI_DESC(PK2US, 0, ti_unsupported, 0),
  1885. +        /*  42 */ TI_DESC(PK4B, 0, ti_unsupported, 0),
  1886. +        /*  43 */ TI_DESC(PK4UB, 0, ti_unsupported, 0),
  1887. +        /*  44 */ TI_DESC(RFL, 0, ti_unsupported, 0),
  1888. +        /*  45 */ TI_DESC(SEQ, ALU_OP2_SETE, ti_alu, 0),
  1889. +        /*  46 */ TI_DESC(SFL, 0, ti_unsupported, 0),
  1890. +        /*  47 */ TI_DESC(SGT, ALU_OP2_SETGT, ti_alu, 0),
  1891. +        /*  48 */ TI_DESC(SIN, ALU_OP1_SIN, ti_trig, 0),
  1892. +        /*  49 */ TI_DESC(SLE, ALU_OP2_SETGE, ti_alu, TIF_ALU_SWAPSRC01),
  1893. +        /*  50 */ TI_DESC(SNE, ALU_OP2_SETNE, ti_alu, 0),
  1894. +        /*  51 */ TI_DESC(STR, 0, ti_unsupported, 0),
  1895. +        /*  52 */ TI_DESC(TEX, FETCH_OP_SAMPLE, ti_tex, 0),
  1896. +        /*  53 */ TI_DESC(TXD, FETCH_OP_SAMPLE_G, ti_tex, 0),
  1897. +        /*  54 */ TI_DESC(TXP, FETCH_OP_SAMPLE, ti_tex, 0),
  1898. +        /*  55 */ TI_DESC(UP2H, 0, ti_unsupported, 0),
  1899. +        /*  56 */ TI_DESC(UP2US, 0, ti_unsupported, 0),
  1900. +        /*  57 */ TI_DESC(UP4B, 0, ti_unsupported, 0),
  1901. +        /*  58 */ TI_DESC(UP4UB, 0, ti_unsupported, 0),
  1902. +        /*  59 */ TI_DESC(X2D, 0, ti_unsupported, 0),
  1903. +        /*  60 */ TI_DESC(ARA, 0, ti_unsupported, 0),
  1904. +        /*  61 */ TI_DESC(ARR, 0, ti_arl, 0),
  1905. +        /*  62 */ TI_DESC(BRA, 0, ti_unsupported, 0),
  1906. +        /*  63 */ TI_DESC(CAL, 0, ti_unsupported, 0),
  1907. +        /*  64 */ TI_DESC(RET, 0, ti_unsupported, 0),
  1908. +        /*  65 */ TI_DESC(SSG, 0, ti_ssg, 0),
  1909. +        /*  66 */ TI_DESC(CMP, 0, ti_cmp, 0),
  1910. +        /*  67 */ TI_DESC(SCS, 0, ti_scs, 0),
  1911. +        /*  68 */ TI_DESC(TXB, FETCH_OP_SAMPLE_LB, ti_tex, 0),
  1912. +        /*  69 */ TI_DESC(NRM, 0, ti_unsupported, 0),
  1913. +        /*  70 */ TI_DESC(DIV, 0, ti_unsupported, 0),
  1914. +        /*  71 */ TI_DESC(DP2, 0, ti_dot, 0),
  1915. +        /*  72 */ TI_DESC(TXL, FETCH_OP_SAMPLE_L, ti_tex, 0),
  1916. +        /*  73 */ TI_DESC(BRK, CF_OP_LOOP_BREAK, ti_loop_op, 0),
  1917. +        /*  74 */ TI_DESC(IF, ALU_OP2_PRED_SETNE, ti_if, 0),
  1918. +        /*  75 */ TI_DESC(UIF, ALU_OP2_PRED_SETNE_INT, ti_if, 0),
  1919. +        /*  76 */ TI_GAP,
  1920. +        /*  77 */ TI_DESC(ELSE, 0, ti_else, 0),
  1921. +        /*  78 */ TI_DESC(ENDIF, 0, ti_endif, 0),
  1922. +        /*  79 */ TI_GAP,
  1923. +        /*  80 */ TI_GAP,
  1924. +        /*  81 */ TI_DESC(PUSHA, 0, ti_unsupported, 0),
  1925. +        /*  82 */ TI_DESC(POPA, 0, ti_unsupported, 0),
  1926. +        /*  83 */ TI_DESC(CEIL, ALU_OP1_CEIL, ti_alu, 0),
  1927. +        /*  84 */ TI_DESC(I2F, ALU_OP1_INT_TO_FLT, ti_alu, 0),
  1928. +        /*  85 */ TI_DESC(NOT, ALU_OP1_NOT_INT, ti_alu, 0),
  1929. +        /*  86 */ TI_DESC(TRUNC, ALU_OP1_TRUNC, ti_alu, 0),
  1930. +        /*  87 */ TI_DESC(SHL, ALU_OP2_LSHL_INT, ti_alu, 0),
  1931. +        /*  88 */ TI_GAP,
  1932. +        /*  89 */ TI_DESC(AND, ALU_OP2_AND_INT, ti_alu, 0),
  1933. +        /*  90 */ TI_DESC(OR, ALU_OP2_OR_INT, ti_alu, 0),
  1934. +        /*  91 */ TI_DESC(MOD, 0, ti_divmod, 0),
  1935. +        /*  92 */ TI_DESC(XOR, ALU_OP2_XOR_INT, ti_alu, 0),
  1936. +        /*  93 */ TI_DESC(SAD, 0, ti_unsupported, 0),
  1937. +        /*  94 */ TI_DESC(TXF, FETCH_OP_LD, ti_tex, 0),
  1938. +        /*  95 */ TI_DESC(TXQ, FETCH_OP_GET_TEXTURE_RESINFO, ti_tex, 0),
  1939. +        /*  96 */ TI_DESC(CONT, CF_OP_LOOP_CONTINUE, ti_loop_op, 0),
  1940. +        /*  97 */ TI_DESC(EMIT, 0, ti_unsupported, 0),
  1941. +        /*  98 */ TI_DESC(ENDPRIM, 0, ti_unsupported, 0),
  1942. +        /*  99 */ TI_DESC(BGNLOOP, 0, ti_begin_loop, 0),
  1943. +        /* 100 */ TI_DESC(BGNSUB, 0, ti_unsupported, 0),
  1944. +        /* 101 */ TI_DESC(ENDLOOP, 0, ti_end_loop, 0),
  1945. +        /* 102 */ TI_DESC(ENDSUB, 0, ti_unsupported, 0),
  1946. +        /* 103 */ TI_DESC(TXQ_LZ, FETCH_OP_GET_TEXTURE_RESINFO, ti_tex, 0),
  1947. +        /* 104 */ TI_GAP,
  1948. +        /* 105 */ TI_GAP,
  1949. +        /* 106 */ TI_GAP,
  1950. +        /* 107 */ TI_DESC(NOP, 0, ti_unsupported, 0),
  1951. +        /* 108 */ TI_GAP,
  1952. +        /* 109 */ TI_GAP,
  1953. +        /* 110 */ TI_GAP,
  1954. +        /* 111 */ TI_GAP,
  1955. +        /* 112 */ TI_DESC(NRM4, 0, ti_unsupported, 0),
  1956. +        /* 113 */ TI_DESC(CALLNZ, 0, ti_unsupported, 0),
  1957. +        /* 114 */ TI_GAP,
  1958. +        /* 115 */ TI_DESC(BREAKC, 0, ti_unsupported, 0),
  1959. +        /* 116 */ TI_DESC(KILL_IF, 0, ti_kill, 0),
  1960. +        /* 117 */ TI_DESC(END, 0, ti_unsupported, 0),
  1961. +        /* 118 */ TI_GAP,
  1962. +        /* 119 */ TI_DESC(F2I, ALU_OP1_FLT_TO_INT, ti_f2iu, 0),
  1963. +        /* 120 */ TI_DESC(IDIV, 0, ti_divmod, 0),
  1964. +        /* 121 */ TI_DESC(IMAX, ALU_OP2_MAX_INT, ti_alu, 0),
  1965. +        /* 122 */ TI_DESC(IMIN, ALU_OP2_MIN_INT, ti_alu, 0),
  1966. +        /* 123 */ TI_DESC(INEG, 0, ti_ineg, 0),
  1967. +        /* 124 */ TI_DESC(ISGE, ALU_OP2_SETGE_INT, ti_alu, 0),
  1968. +        /* 125 */ TI_DESC(ISHR, ALU_OP2_ASHR_INT, ti_alu, 0),
  1969. +        /* 126 */ TI_DESC(ISLT, ALU_OP2_SETGT_INT, ti_alu, TIF_ALU_SWAPSRC01),
  1970. +        /* 127 */ TI_DESC(F2U, ALU_OP1_FLT_TO_UINT, ti_f2iu, 0),
  1971. +        /* 128 */ TI_DESC(U2F, ALU_OP1_UINT_TO_FLT, ti_alu, 0),
  1972. +        /* 129 */ TI_DESC(UADD, ALU_OP2_ADD_INT, ti_alu, 0),
  1973. +        /* 130 */ TI_DESC(UDIV, 0, ti_divmod, 0),
  1974. +        /* 131 */ TI_DESC(UMAD, 0, ti_umad, 0),
  1975. +        /* 132 */ TI_DESC(UMAX, ALU_OP2_MAX_UINT, ti_alu, 0),
  1976. +        /* 133 */ TI_DESC(UMIN, ALU_OP2_MIN_UINT, ti_alu, 0),
  1977. +        /* 134 */ TI_DESC(UMOD, 0, ti_divmod, 0),
  1978. +        /* 135 */ TI_DESC(UMUL, ALU_OP2_MULLO_INT, ti_alu, 0),
  1979. +        /* 136 */ TI_DESC(USEQ, ALU_OP2_SETE_INT, ti_alu, 0),
  1980. +        /* 137 */ TI_DESC(USGE, ALU_OP2_SETGE_UINT, ti_alu, 0),
  1981. +        /* 138 */ TI_DESC(USHR, ALU_OP2_LSHR_INT, ti_alu, 0),
  1982. +        /* 139 */ TI_DESC(USLT, ALU_OP2_SETGT_UINT, ti_alu, TIF_ALU_SWAPSRC01),
  1983. +        /* 140 */ TI_DESC(USNE, ALU_OP2_SETNE_INT, ti_alu, 0),
  1984. +        /* 141 */ TI_DESC(SWITCH, 0, ti_unsupported, 0),
  1985. +        /* 142 */ TI_DESC(CASE, 0, ti_unsupported, 0),
  1986. +        /* 143 */ TI_DESC(DEFAULT, 0, ti_unsupported, 0),
  1987. +        /* 144 */ TI_DESC(ENDSWITCH, 0, ti_unsupported, 0),
  1988. +        /* 145 */ TI_DESC(SAMPLE, 0, ti_unsupported, 0),
  1989. +        /* 146 */ TI_DESC(SAMPLE_I, 0, ti_unsupported, 0),
  1990. +        /* 147 */ TI_DESC(SAMPLE_I_MS, 0, ti_unsupported, 0),
  1991. +        /* 148 */ TI_DESC(SAMPLE_B, 0, ti_unsupported, 0),
  1992. +        /* 149 */ TI_DESC(SAMPLE_C, 0, ti_unsupported, 0),
  1993. +        /* 150 */ TI_DESC(SAMPLE_C_LZ, 0, ti_unsupported, 0),
  1994. +        /* 151 */ TI_DESC(SAMPLE_D, 0, ti_unsupported, 0),
  1995. +        /* 152 */ TI_DESC(SAMPLE_L, 0, ti_unsupported, 0),
  1996. +        /* 153 */ TI_DESC(GATHER4, 0, ti_unsupported, 0),
  1997. +        /* 154 */ TI_DESC(SVIEWINFO, 0, ti_unsupported, 0),
  1998. +        /* 155 */ TI_DESC(SAMPLE_POS, 0, ti_unsupported, 0),
  1999. +        /* 156 */ TI_DESC(SAMPLE_INFO, 0, ti_unsupported, 0),
  2000. +        /* 157 */ TI_DESC(UARL, 0, ti_arl, 0),
  2001. +        /* 158 */ TI_DESC(UCMP, 0, ti_cmp, 0),
  2002. +        /* 159 */ TI_DESC(IABS, 0, ti_iabs, 0),
  2003. +        /* 160 */ TI_DESC(ISSG, 0, ti_ssg, 0),
  2004. +        /* 161 */ TI_DESC(LOAD, 0, ti_unsupported, 0),
  2005. +        /* 162 */ TI_DESC(STORE, 0, ti_unsupported, 0),
  2006. +        /* 163 */ TI_DESC(MFENCE, 0, ti_unsupported, 0),
  2007. +        /* 164 */ TI_DESC(LFENCE, 0, ti_unsupported, 0),
  2008. +        /* 165 */ TI_DESC(SFENCE, 0, ti_unsupported, 0),
  2009. +        /* 166 */ TI_DESC(BARRIER, 0, ti_unsupported, 0),
  2010. +        /* 167 */ TI_DESC(ATOMUADD, 0, ti_unsupported, 0),
  2011. +        /* 168 */ TI_DESC(ATOMXCHG, 0, ti_unsupported, 0),
  2012. +        /* 169 */ TI_DESC(ATOMCAS, 0, ti_unsupported, 0),
  2013. +        /* 170 */ TI_DESC(ATOMAND, 0, ti_unsupported, 0),
  2014. +        /* 171 */ TI_DESC(ATOMOR, 0, ti_unsupported, 0),
  2015. +        /* 172 */ TI_DESC(ATOMXOR, 0, ti_unsupported, 0),
  2016. +        /* 173 */ TI_DESC(ATOMUMIN, 0, ti_unsupported, 0),
  2017. +        /* 174 */ TI_DESC(ATOMUMAX, 0, ti_unsupported, 0),
  2018. +        /* 175 */ TI_DESC(ATOMIMIN, 0, ti_unsupported, 0),
  2019. +        /* 176 */ TI_DESC(ATOMIMAX, 0, ti_unsupported, 0),
  2020. +        /* 177 */ TI_DESC(TEX2, FETCH_OP_SAMPLE, ti_tex, 0),
  2021. +        /* 178 */ TI_DESC(TXB2, FETCH_OP_SAMPLE_LB, ti_tex, 0),
  2022. +        /* 179 */ TI_DESC(TXL2, FETCH_OP_SAMPLE_L, ti_tex, 0)
  2023. +        /* 180 */ /* TI_DESC(LAST, 0, ti_unsupported, 0) */
  2024. +        };
  2025. +
  2026. +#undef TI_DESC
  2027. +#undef TI_GAP
  2028. +
  2029. +#define FILLV4(a, b) a[0]=a[1]=a[2]=a[3]=b
  2030. +#define VSWZ_XYZW(a) a[0] = 0; a[1] = 1; a[2] = 2; a[3] = 3
  2031. +#define VSWZ_MASK(a) FILLV4(a, SEL_MASK)
  2032. +#define VSWZ_INIT(a, s0, s1, s2, s3) a[0]=s0; a[1]=s1; a[2]=s2; a[3]=s3
  2033. +
  2034. +shader* tgsi_translator::translate() {
  2035. +   shader_target target;
  2036. +   int r;
  2037. +
  2038. +   tokens = ps->selector->tokens;
  2039. +   tgsi_parse_init(&parse, tokens);
  2040. +
  2041. +   tgsi_proc = parse.FullHeader.Processor.Processor;
  2042. +
  2043. +   switch (tgsi_proc) {
  2044. +   case TGSI_PROCESSOR_VERTEX:
  2045. +       target = TARGET_VS;
  2046. +       break;
  2047. +   case TGSI_PROCESSOR_FRAGMENT:
  2048. +       target = TARGET_PS;
  2049. +       break;
  2050. +   case TGSI_PROCESSOR_GEOMETRY:
  2051. +       target = TARGET_GS;
  2052. +       break;
  2053. +   case TGSI_PROCESSOR_COMPUTE:
  2054. +       target = TARGET_COMPUTE;
  2055. +       break;
  2056. +   default:
  2057. +       assert(!"unexpected shader type");
  2058. +       return NULL;
  2059. +   }
  2060. +
  2061. +   sh = new shader(ctx, target, shader_id, true);
  2062. +   sh->init();
  2063. +   current = sh->root;
  2064. +
  2065. +   if ((r = parse_declarations()))
  2066. +       return NULL;
  2067. +
  2068. +   emit_inputs();
  2069. +
  2070. +   if ((r = parse_instructions()))
  2071. +       return NULL;
  2072. +
  2073. +   tgsi_parse_free(&parse);
  2074. +
  2075. +   emit_exports();
  2076. +
  2077. +   update_pipe_shader();
  2078. +
  2079. +   if (r) {
  2080. +       delete sh;
  2081. +       return NULL;
  2082. +   } else
  2083. +       return sh;
  2084. +}
  2085. +
  2086. +int tgsi_translator::parse_declarations() {
  2087. +   int r;
  2088. +
  2089. +   while (!tgsi_parse_end_of_tokens(&parse)) {
  2090. +       tgsi_parse_token(&parse);
  2091. +       switch (parse.FullToken.Token.Type) {
  2092. +       case TGSI_TOKEN_TYPE_PROPERTY:
  2093. +           r = parse_property();
  2094. +           break;
  2095. +       case TGSI_TOKEN_TYPE_DECLARATION:
  2096. +           r = parse_declaration();
  2097. +           break;
  2098. +       case TGSI_TOKEN_TYPE_IMMEDIATE:
  2099. +           r = parse_immediate();
  2100. +           break;
  2101. +       case TGSI_TOKEN_TYPE_INSTRUCTION:
  2102. +           return 0;
  2103. +           break;
  2104. +       default:
  2105. +           assert(!"unexpected tgsi token type");
  2106. +           return -1;
  2107. +       }
  2108. +       if (r)
  2109. +           return r;
  2110. +   }
  2111. +   return 0;
  2112. +}
  2113. +
  2114. +int tgsi_translator::parse_instructions() {
  2115. +   int r;
  2116. +
  2117. +   while (true) {
  2118. +       switch (parse.FullToken.Token.Type) {
  2119. +       case TGSI_TOKEN_TYPE_INSTRUCTION:
  2120. +           r = parse_instruction();
  2121. +           break;
  2122. +       default:
  2123. +           assert(!"unexpected tgsi token type");
  2124. +           return -1;
  2125. +       }
  2126. +       if (r)
  2127. +           return r;
  2128. +
  2129. +       if (tgsi_parse_end_of_tokens(&parse))
  2130. +           break;
  2131. +       tgsi_parse_token(&parse);
  2132. +   }
  2133. +   return 0;
  2134. +}
  2135. +
  2136. +int tgsi_translator::parse_property() {
  2137. +   tgsi_full_property *property = &parse.FullToken.FullProperty;
  2138. +
  2139. +   switch (property->Property.PropertyName) {
  2140. +   case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
  2141. +       if (property->u[0].Data == 1)
  2142. +           fs_write_all = TRUE;
  2143. +       break;
  2144. +   case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
  2145. +       break;
  2146. +   default:
  2147. +       assert(!"unexpected tgsi property token");
  2148. +       break;
  2149. +   }
  2150. +
  2151. +   return 0;
  2152. +}
  2153. +
  2154. +int tgsi_translator::parse_declaration() {
  2155. +   tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
  2156. +   unsigned i;
  2157. +
  2158. +   // TODO handle array input/output decls
  2159. +
  2160. +   switch (d->Declaration.File) {
  2161. +   case TGSI_FILE_INPUT:
  2162. +       i = ninput++;
  2163. +       input[i].d.name = d->Semantic.Name;
  2164. +       input[i].d.sid = d->Semantic.Index;
  2165. +       input[i].d.interpolate = d->Interp.Interpolate;
  2166. +       input[i].d.centroid = d->Interp.Centroid;
  2167. +       input[i].tgsi_index = d->Range.First;
  2168. +       if (tgsi_proc == TGSI_PROCESSOR_FRAGMENT) {
  2169. +           if (input[i].d.name != TGSI_SEMANTIC_POSITION &&
  2170. +                   input[i].d.name != TGSI_SEMANTIC_FACE) {
  2171. +               if (input[i].d.interpolate == TGSI_INTERPOLATE_LINEAR)
  2172. +                   interp_mask |= (1 << 1);
  2173. +               else if (input[i].d.interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
  2174. +                   interp_mask |= (1 << 0);
  2175. +               if (input[i].d.centroid)
  2176. +                   interp_mask |= (1 << 2);
  2177. +           }
  2178. +
  2179. +           input[i].d.spi_sid = spi_sid(input[i].d.name, input[i].d.sid);
  2180. +
  2181. +           switch (input[i].d.name) {
  2182. +           case TGSI_SEMANTIC_FACE:
  2183. +               face_input = i;
  2184. +               break;
  2185. +           case TGSI_SEMANTIC_COLOR:
  2186. +               ++colors_used;
  2187. +               break;
  2188. +           case TGSI_SEMANTIC_POSITION:
  2189. +               fragcoord_input = i;
  2190. +               break;
  2191. +           }
  2192. +       }
  2193. +       break;
  2194. +   case TGSI_FILE_OUTPUT:
  2195. +       i = noutput++;
  2196. +       output[i].d.name = d->Semantic.Name;
  2197. +       output[i].d.sid = d->Semantic.Index;
  2198. +       output[i].d.interpolate = d->Interp.Interpolate;
  2199. +       output[i].d.write_mask = d->Declaration.UsageMask;
  2200. +       output[i].tgsi_index = d->Range.First;
  2201. +       if (tgsi_proc == TGSI_PROCESSOR_VERTEX) {
  2202. +           output[i].d.spi_sid = spi_sid(output[i].d.name, output[i].d.sid);
  2203. +           switch (d->Semantic.Name) {
  2204. +           case TGSI_SEMANTIC_CLIPDIST:
  2205. +               clip_dist_write |= d->Declaration.UsageMask
  2206. +                       << (d->Semantic.Index << 2);
  2207. +               break;
  2208. +           case TGSI_SEMANTIC_PSIZE:
  2209. +               vs_out_misc_write = 1;
  2210. +               vs_out_point_size = 1;
  2211. +               break;
  2212. +           case TGSI_SEMANTIC_CLIPVERTEX:
  2213. +               clip_vertex_write = TRUE;
  2214. +               cv_output = i;
  2215. +               break;
  2216. +           }
  2217. +       } else if (tgsi_proc == TGSI_PROCESSOR_FRAGMENT) {
  2218. +           switch (d->Semantic.Name) {
  2219. +           case TGSI_SEMANTIC_COLOR:
  2220. +               nr_ps_max_color_exports++;
  2221. +               break;
  2222. +           }
  2223. +       }
  2224. +       break;
  2225. +
  2226. +   case TGSI_FILE_TEMPORARY:
  2227. +       if (d->Array.ArrayID && d->Range.Last > d->Range.First) {
  2228. +           sh->add_rel_array(VLK_TGSI_TEMP, d->Range.First,
  2229. +               d->Range.Last - d->Range.First + 1, 0xF, d->Array.ArrayID);
  2230. +       }
  2231. +       break;
  2232. +
  2233. +   case TGSI_FILE_CONSTANT:
  2234. +   case TGSI_FILE_SAMPLER:
  2235. +   case TGSI_FILE_ADDRESS:
  2236. +       break;
  2237. +
  2238. +   case TGSI_FILE_SYSTEM_VALUE:
  2239. +       if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
  2240. +           instanceid_index = d->Range.First;
  2241. +           break;
  2242. +       } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
  2243. +           vertexid_index = d->Range.First;
  2244. +       break;
  2245. +   default:
  2246. +       assert(!"unexpected tgsi declaration");
  2247. +       return -1;
  2248. +   }
  2249. +   return 0;
  2250. +}
  2251. +
  2252. +int tgsi_translator::parse_immediate() {
  2253. +   literals.reserve(literals.size() + 4);
  2254. +   literals.push_back(parse.FullToken.FullImmediate.u[0].Uint);
  2255. +   literals.push_back(parse.FullToken.FullImmediate.u[1].Uint);
  2256. +   literals.push_back(parse.FullToken.FullImmediate.u[2].Uint);
  2257. +   literals.push_back(parse.FullToken.FullImmediate.u[3].Uint);
  2258. +   return 0;
  2259. +}
  2260. +
  2261. +int tgsi_translator::spi_sid(int name, int sid) {
  2262. +   int index;
  2263. +
  2264. +   /* These params are handled differently; they don't need
  2265. +    * semantic indices, so we'll use 0 for them. */
  2266. +   if (name == TGSI_SEMANTIC_POSITION || name == TGSI_SEMANTIC_PSIZE
  2267. +           || name == TGSI_SEMANTIC_FACE)
  2268. +       index = 0;
  2269. +   else {
  2270. +       if (name == TGSI_SEMANTIC_GENERIC) {
  2271. +           /* For generic params simply use sid from tgsi */
  2272. +           index = sid;
  2273. +       } else {
  2274. +           /* For non-generic params - pack name and sid into 8 bits */
  2275. +           index = 0x80 | (name << 3) | sid;
  2276. +       }
  2277. +       /* Make sure that all really used indices have nonzero value, so
  2278. +        * we can just compare it to 0 later instead of comparing the name
  2279. +        * with different values to detect special cases. */
  2280. +       index++;
  2281. +   }
  2282. +   return index;
  2283. +}
  2284. +
  2285. +int tgsi_translator::parse_instruction() {
  2286. +
  2287. +   int i, r;
  2288. +   inst = &parse.FullToken.FullInstruction;
  2289. +   unsigned tgsi_opcode = inst->Instruction.Opcode;
  2290. +
  2291. +   if (tgsi_opcode == TGSI_OPCODE_END)
  2292. +       return 0;
  2293. +
  2294. +   info = &tgsi_info_table[tgsi_opcode];
  2295. +   assert(info->tgsi_op == tgsi_opcode);
  2296. +   assert(info->func);
  2297. +
  2298. +   args = tgsi_args();
  2299. +
  2300. +   if (inst->Instruction.NumDstRegs) {
  2301. +       assert(inst->Instruction.NumDstRegs == 1);
  2302. +       args.dst.dst = true;
  2303. +       args.dst.file = inst->Dst[0].Register.File;
  2304. +       args.dst.kind = file_to_value_kind(args.dst.file);
  2305. +       args.dst.sel = inst->Dst[0].Register.Index;
  2306. +       args.dst.rel = inst->Dst[0].Register.Indirect;
  2307. +
  2308. +       if (args.dst.rel) {
  2309. +           assert(inst->Dst[0].Indirect.File == TGSI_FILE_ADDRESS);
  2310. +           args.dst.rel_addr_index = inst->Dst[0].Indirect.Index;
  2311. +           args.dst.rel_addr_chan = inst->Dst[0].Indirect.Swizzle;
  2312. +           args.dst.rel_array_id = inst->Dst[0].Indirect.ArrayID;
  2313. +           indirect_vlk |= (1 << args.dst.kind);
  2314. +       }
  2315. +   }
  2316. +
  2317. +   write_mask = inst->Dst[0].Register.WriteMask;
  2318. +   clamp = inst->Instruction.Saturate;
  2319. +   args.nsrc = inst->Instruction.NumSrcRegs;
  2320. +
  2321. +   unsigned nconst = 0;
  2322. +   unsigned nliteral = 0;
  2323. +
  2324. +   for (i = 0; i < args.nsrc; ++i) {
  2325. +       tgsi_arg &a = args.src[i];
  2326. +       a.file = inst->Src[i].Register.File;
  2327. +       a.sel = inst->Src[i].Register.Index;
  2328. +
  2329. +       if (a.file == TGSI_FILE_SYSTEM_VALUE) {
  2330. +           if (a.sel == instanceid_index) {
  2331. +               a.kind = VLK_REG;
  2332. +               a.sel = 0;
  2333. +               FILLV4(a.swz, SEL_W);
  2334. +           } else if (a.sel == vertexid_index) {
  2335. +               a.kind = VLK_REG;
  2336. +               a.sel = 0;
  2337. +               FILLV4(a.swz, SEL_X);
  2338. +           } else {
  2339. +               assert(!"unexpected system value");
  2340. +           }
  2341. +       } else if (a.file != TGSI_FILE_SAMPLER) {
  2342. +
  2343. +           a.kind = file_to_value_kind(a.file);
  2344. +           a.rel = inst->Src[i].Register.Indirect;
  2345. +           a.neg = inst->Src[i].Register.Negate;
  2346. +           a.abs = inst->Src[i].Register.Absolute;
  2347. +           a.swz[0] = inst->Src[i].Register.SwizzleX;
  2348. +           a.swz[1] = inst->Src[i].Register.SwizzleY;
  2349. +           a.swz[2] = inst->Src[i].Register.SwizzleZ;
  2350. +           a.swz[3] = inst->Src[i].Register.SwizzleW;
  2351. +
  2352. +           if (a.kind == VLK_KCACHE && inst->Src[i].Register.Dimension)
  2353. +               a.kc_bank = inst->Src[i].Dimension.Index;
  2354. +
  2355. +           if (a.rel) {
  2356. +               assert(inst->Src[i].Indirect.File == TGSI_FILE_ADDRESS);
  2357. +               a.rel_addr_index = inst->Src[i].Indirect.Index;
  2358. +               a.rel_addr_chan = inst->Src[i].Indirect.Swizzle;
  2359. +               a.rel_array_id = inst->Src[i].Indirect.ArrayID;
  2360. +               indirect_vlk |= (1 << a.kind);
  2361. +
  2362. +               if (a.file == TGSI_FILE_CONSTANT) {
  2363. +                   fetch_rel_const(a);
  2364. +               }
  2365. +           } else if (a.file == TGSI_FILE_CONSTANT) {
  2366. +               if (nconst == 1) {
  2367. +                   split_src_arg(a);
  2368. +               } else
  2369. +                   ++nconst;
  2370. +           } else if (a.file == TGSI_FILE_IMMEDIATE) {
  2371. +               if (nliteral == 1) {
  2372. +                   split_src_arg(a);
  2373. +               } else
  2374. +                   ++nliteral;
  2375. +           }
  2376. +       }
  2377. +   }
  2378. +
  2379. +   if ((r = (this->*(info->func))()))
  2380. +       return r;
  2381. +
  2382. +   return 0;
  2383. +}
  2384. +
  2385. +int tgsi_translator::emit_export(shader_io& o, unsigned type, unsigned base,
  2386. +                                 unsigned * swz, unsigned tgsi_index) {
  2387. +   cf_node *e = sh->create_cf(CF_OP_EXPORT);
  2388. +   e->src = get_vector_values(VLK_TGSI_OUTPUT, tgsi_index, swz);
  2389. +   e->bc.array_base = base;
  2390. +   e->bc.type = type;
  2391. +   e->bc.elem_size = 3; // XXX is it required?
  2392. +   memcpy(e->bc.sel, swz, 4 * sizeof(unsigned));
  2393. +   emit_node(e);
  2394. +   return 0;
  2395. +}
  2396. +
  2397. +int tgsi_translator::emit_fake_export(unsigned type) {
  2398. +   cf_node *e = sh->create_cf(CF_OP_EXPORT);
  2399. +   e->bc.sel[0] = 7;
  2400. +   e->bc.sel[1] = 7;
  2401. +   e->bc.sel[2] = 7;
  2402. +   e->bc.sel[3] = 7;
  2403. +   e->src.resize(4);
  2404. +   e->bc.array_base = type == EXP_POS ? 60 : 0;
  2405. +   e->bc.type = type;
  2406. +   e->bc.elem_size = 3;
  2407. +   emit_node(e);
  2408. +   return 0;
  2409. +}
  2410. +
  2411. +int tgsi_translator::emit_exports() {
  2412. +   int i, j, k, n;
  2413. +   int next_pos = 60, next_pixel = 0, next_param = 0;
  2414. +
  2415. +   if (clip_vertex_write) {
  2416. +       int cd = noutput;
  2417. +
  2418. +       noutput += 2;
  2419. +       output[cd].d.name = TGSI_SEMANTIC_CLIPDIST;
  2420. +       output[cd].tgsi_index = cd;
  2421. +       output[cd + 1].d.name = TGSI_SEMANTIC_CLIPDIST;
  2422. +       output[cd + 1].tgsi_index = cd + 1;
  2423. +
  2424. +       output[cv_output].d.spi_sid = 0;
  2425. +       clip_dist_write = 0xFF;
  2426. +
  2427. +       for (i = 0; i < 8; i++) {
  2428. +           int oreg = i >> 2, ochan = i & 3;
  2429. +           value *o = get_tgsi_value(VLK_TGSI_OUTPUT, cd + oreg, ochan);
  2430. +           alu_packed_node *p = sh->create_alu_packed();
  2431. +
  2432. +           for (j = 0; j < 4; j++) {
  2433. +               value *cvo = get_tgsi_value(VLK_TGSI_OUTPUT, cv_output, j);
  2434. +               value *cp = sh->get_kcache_value(R600_UCP_CONST_BUFFER, i, j);
  2435. +               alu_node *a = build_alu(ALU_OP2_DOT4, j == ochan ? o : NULL, 0,
  2436. +                       asrc(cvo), asrc(cp));
  2437. +               a->bc.slot = j;
  2438. +               p->push_back(a);
  2439. +           }
  2440. +           emit_node(p);
  2441. +       }
  2442. +   }
  2443. +
  2444. +   pipe_stream_output_info &so = ps->selector->so;
  2445. +   for (i = 0; i < (int)so.num_outputs; i++) {
  2446. +       int nc = so.output[i].num_components;
  2447. +       unsigned start_comp = so.output[i].start_component, real_start;
  2448. +       unsigned index = so.output[i].register_index;
  2449. +       unsigned dst_offset = so.output[i].dst_offset;
  2450. +       unsigned buf = so.output[i].output_buffer;
  2451. +       unsigned op = so.output[i].output_buffer;
  2452. +
  2453. +       assert(buf < 4);
  2454. +       op += ctx.is_egcm() ? CF_OP_MEM_STREAM0_BUF0 : CF_OP_MEM_STREAM0;
  2455. +       real_start = (dst_offset < start_comp) ? 0 : start_comp;
  2456. +
  2457. +       cf_node *ms = sh->create_cf(op);
  2458. +       ms->bc.elem_size = nc;
  2459. +       ms->bc.array_base = dst_offset - real_start;
  2460. +       ms->bc.type = MEM_WRITE;
  2461. +       ms->bc.array_size = 0xFFF;
  2462. +       ms->src.resize(4);
  2463. +
  2464. +       for (j = 0; j < nc; ++j) {
  2465. +           value *v = get_tgsi_value(VLK_TGSI_OUTPUT, index, start_comp + j);
  2466. +           ms->src[real_start + j] = v;
  2467. +       }
  2468. +       emit_node(ms);
  2469. +   }
  2470. +
  2471. +   for (i = 0; i < noutput; ++i) {
  2472. +       shader_io &o = output[i];
  2473. +       unsigned ti = o.tgsi_index;
  2474. +       unsigned swz[4] = { 0, 1, 2, 3 };
  2475. +
  2476. +       switch (sh->target) {
  2477. +       case TARGET_VS:
  2478. +           switch (o.d.name) {
  2479. +           case TGSI_SEMANTIC_CLIPDIST:
  2480. +               if (o.d.spi_sid)
  2481. +                   emit_export(o, EXP_PARAM, next_param++, swz, ti);
  2482. +               /* fall through */
  2483. +           case TGSI_SEMANTIC_POSITION:
  2484. +           case TGSI_SEMANTIC_PSIZE:
  2485. +               emit_export(o, EXP_POS, next_pos++, swz, ti);
  2486. +               break;
  2487. +           case TGSI_SEMANTIC_FOG:
  2488. +               swz[1] = 4;
  2489. +               swz[2] = 4;
  2490. +               swz[3] = 5; /* x001 */
  2491. +               emit_export(o, EXP_PARAM, next_param++, swz, ti);
  2492. +               break;
  2493. +           case TGSI_SEMANTIC_CLIPVERTEX:
  2494. +               break;
  2495. +           default:
  2496. +               emit_export(o, EXP_PARAM, next_param++, swz, ti);
  2497. +           }
  2498. +           break;
  2499. +       case TARGET_PS:
  2500. +           if (fs_write_all && ctx.is_egcm())
  2501. +               nr_ps_max_color_exports = 8;
  2502. +
  2503. +           switch (o.d.name) {
  2504. +           case TGSI_SEMANTIC_COLOR:
  2505. +               if (next_pixel && next_pixel >= key.nr_cbufs)
  2506. +                   continue;
  2507. +               swz[3] = key.alpha_to_one ? 5 : 3;
  2508. +               n = (fs_write_all && ctx.is_egcm() && key.nr_cbufs) ?
  2509. +                       key.nr_cbufs : 1;
  2510. +               for (k = 0; k < n; k++) {
  2511. +                   emit_export(o, EXP_PIXEL, next_pixel++, swz, ti);
  2512. +               }
  2513. +               nr_ps_color_exports += n;
  2514. +               break;
  2515. +           case TGSI_SEMANTIC_POSITION:
  2516. +               swz[0] = 2;
  2517. +               swz[1] = 7;
  2518. +               swz[2] = 7;
  2519. +               swz[3] = 7;
  2520. +               emit_export(o, EXP_PIXEL, 61, swz, ti);
  2521. +               break;
  2522. +           case TGSI_SEMANTIC_STENCIL:
  2523. +               swz[0] = 7;
  2524. +               swz[1] = 1;
  2525. +               swz[2] = 7;
  2526. +               swz[3] = 7;
  2527. +               emit_export(o, EXP_PIXEL, 61, swz, ti);
  2528. +               break;
  2529. +           default:
  2530. +               assert(!"unexpected ps output");
  2531. +           }
  2532. +           break;
  2533. +       default:
  2534. +           assert(!"unexpected shader target");
  2535. +           break;
  2536. +       }
  2537. +   }
  2538. +
  2539. +   if (sh->target == TARGET_VS) {
  2540. +       if (next_pos == 60)
  2541. +           emit_fake_export(EXP_POS);
  2542. +       if (next_param == 0)
  2543. +           emit_fake_export(EXP_PARAM);
  2544. +   } else if (sh->target == TARGET_PS && next_pixel == 0)
  2545. +       emit_fake_export(EXP_PIXEL);
  2546. +
  2547. +   return 0;
  2548. +}
  2549. +
  2550. +int tgsi_translator::ti_unsupported() {
  2551. +   sblog << "sb tgsi: unsupported tgsi op " << info->name << "\n";
  2552. +   return -1;
  2553. +}
  2554. +
  2555. +inline void tgsi_translator::emit_node(node* n) {
  2556. +   current->push_back(n);
  2557. +}
  2558. +
  2559. +// Groups at this stage are used only to represent parallel execution in some
  2560. +// cases until we convert the code to SSA form; they have nothing to do with
  2561. +// the VLIW alu groups that will be created later by the post_scheduler pass.
  2562. +// E.g., when translating "MOV TEMP[0].xy, TEMP[0].yx", we can put two ISA MOVs
  2563. +// into a group to tell the backend that they should be executed in parallel;
  2564. +// otherwise we would need an extra temp value and 3 MOVs to perform the swap.
  2565. +// Groups are taken into account by SSA construction - all source operands
  2566. +// of grouped operations use the versions that existed before the group.
  2567. +inline void tgsi_translator::begin_group() {
  2568. +   alu_group_node *g = sh->create_alu_group();
  2569. +   current->push_back(g);
  2570. +   current = g;
  2571. +}
  2572. +
  2573. +inline void tgsi_translator::end_group() {
  2574. +   assert(current->is_alu_group());
  2575. +   current = current->parent;
  2576. +}
  2577. +
  2578. +vvec tgsi_translator::get_vector_values(value_kind kind, unsigned tgsi_index,
  2579. +                                        unsigned* swz) {
  2580. +   vvec vv;
  2581. +   unsigned i;
  2582. +
  2583. +   vv.resize(4);
  2584. +   for (i = 0; i < 4; ++i) {
  2585. +       unsigned chan = swz ? swz[i] : i;
  2586. +       if (chan < 4)
  2587. +           vv[i] = get_tgsi_value(kind, tgsi_index, chan);
  2588. +   }
  2589. +   return vv;
  2590. +}
  2591. +
  2592. +int tgsi_translator::emit_inputs() {
  2593. +   int i, nparam = 0, gpr_reserved = 0;
  2594. +
  2595. +   // XXX temporary workaround for lack of proper array support for inputs
  2596. +   if (ninput)
  2597. +       sh->add_rel_array(VLK_TGSI_INPUT, 0, ninput, 0xF, 0);
  2598. +
  2599. +   switch (sh->target) {
  2600. +   case TARGET_VS: {
  2601. +       cf_node *c = sh->create_cf(CF_OP_CALL_FS);
  2602. +
  2603. +       c->flags |= NF_SCHEDULE_EARLY | NF_DONT_MOVE;
  2604. +       sh->add_pinned_gpr_values(c->src, 0, 0xF, true);
  2605. +       sh->add_input(0, true, 0xF);
  2606. +
  2607. +       // pin input arrays
  2608. +       for (i = 0; i < 4; ++i) {
  2609. +           rel_array *a = sh->get_rel_array(VLK_TGSI_INPUT, 0, i);
  2610. +           if (a)
  2611. +               a->gpr = sel_chan(1, i);
  2612. +       }
  2613. +
  2614. +       for (i = 0; i < ninput; ++i) {
  2615. +           shader_io &in = input[i];
  2616. +           vvec dv = get_vector_values(VLK_TGSI_INPUT, in.tgsi_index);
  2617. +           c->dst.insert(c->dst.end(), dv.begin(), dv.end());
  2618. +       }
  2619. +       emit_node(c);
  2620. +       break;
  2621. +   }
  2622. +   case TARGET_PS:
  2623. +       if (ctx.is_egcm()) {
  2624. +           if (!interp_mask)
  2625. +               interp_mask = 1;
  2626. +
  2627. +           unsigned ij_pairs = ((interp_mask & 1) + ((interp_mask >> 1) & 1))
  2628. +                   * ((interp_mask & 4) ? 2 : 1);
  2629. +
  2630. +           unsigned mask = (1u << 2 * ij_pairs) - 1;
  2631. +           unsigned gpr = 0;
  2632. +
  2633. +           while (mask) {
  2634. +               sh->add_input(gpr, true, mask & 0x0F);
  2635. +               ++gpr;
  2636. +               mask >>= 4;
  2637. +           }
  2638. +           gpr_reserved = gpr;
  2639. +       }
  2640. +
  2641. +       // pin input arrays
  2642. +       for (i = 0; i < 4; ++i) {
  2643. +           rel_array *a = sh->get_rel_array(VLK_TGSI_INPUT, 0, i);
  2644. +           if (a)
  2645. +               a->gpr = sel_chan(gpr_reserved, i);
  2646. +       }
  2647. +
  2648. +       if (key.color_two_side && colors_used) {
  2649. +           two_side = 1;
  2650. +
  2651. +           if (face_input == -1) {
  2652. +               i = ninput++;
  2653. +               input[i].d.name = TGSI_SEMANTIC_FACE;
  2654. +               input[i].d.spi_sid = 0;
  2655. +               input[i].tgsi_index = i;
  2656. +               face_input = i;
  2657. +           }
  2658. +       }
  2659. +
  2660. +       for (i = 0; i < ninput; ++i) {
  2661. +           shader_io &in = input[i];
  2662. +           in.d.gpr = gpr_reserved++;
  2663. +
  2664. +           if (ctx.is_egcm() && in.d.spi_sid) {
  2665. +               in.d.lds_pos = nparam++;
  2666. +               if (in.d.interpolate != TGSI_INTERPOLATE_CONSTANT) {
  2667. +                   in.d.ij_index = get_ij(in);
  2668. +
  2669. +                   emit_node(build_interp(in, 1));
  2670. +                   emit_node(build_interp(in, 0));
  2671. +               } else {
  2672. +                   emit_node(build_interp_flat(in));
  2673. +               }
  2674. +           } else {
  2675. +               sh->add_pinned_inputs(sh->root->dst, VLK_TGSI_INPUT,
  2676. +                   in.tgsi_index, 0xF, false, in.d.gpr);
  2677. +           }
  2678. +
  2679. +           if (two_side) {
  2680. +               if (in.d.name == TGSI_SEMANTIC_COLOR) {
  2681. +                   int ni = ninput++;
  2682. +                   shader_io &nin = input[ni];
  2683. +                   nin = in;
  2684. +                   nin.d.name = TGSI_SEMANTIC_BCOLOR;
  2685. +                   nin.d.spi_sid = spi_sid(nin.d.name, nin.d.sid);
  2686. +                   // back_color_input actually means front_color_input here
  2687. +                   nin.d.back_color_input = i;
  2688. +                   nin.tgsi_index = ni;
  2689. +               } else if (in.d.name == TGSI_SEMANTIC_BCOLOR) {
  2690. +                   // both inputs are interpolated now, so select the color
  2691. +                   int k;
  2692. +                   shader_io &fin = input[in.d.back_color_input];
  2693. +
  2694. +                   for (k = 0; k < 4; ++k) {
  2695. +                       value *face = sh->get_value(VLK_TGSI_INPUT,
  2696. +                           sel_chan(input[face_input].tgsi_index, 0));
  2697. +                       value *fv = sh->get_value(VLK_TGSI_INPUT,
  2698. +                           sel_chan(fin.tgsi_index, k));
  2699. +                       value *bv = sh->get_value(VLK_TGSI_INPUT,
  2700. +                           sel_chan(in.tgsi_index, k));
  2701. +                       emit_alu(ALU_OP3_CNDGT, fv, 0, asrc(face), asrc(fv),
  2702. +                                asrc(bv));
  2703. +                   }
  2704. +               }
  2705. +           }
  2706. +       }
  2707. +
  2708. +       if (fragcoord_input != -1) {
  2709. +           value* w = get_tgsi_value(VLK_TGSI_INPUT, fragcoord_input, SEL_W);
  2710. +           emit_alu(ALU_OP1_RECIP_IEEE, w, 0, asrc(w));
  2711. +       }
  2712. +
  2713. +
  2714. +       break;
  2715. +   default:
  2716. +       assert(!"unexpected target");
  2717. +   }
  2718. +   return 0;
  2719. +}
  2720. +
  2721. +alu_packed_node* tgsi_translator::build_interp(shader_io& in, unsigned type) {
  2722. +   alu_packed_node *p = sh->create_alu_packed();
  2723. +   unsigned op = type == 0 ? ALU_OP2_INTERP_XY : ALU_OP2_INTERP_ZW;
  2724. +   unsigned i, gpr, base_chan;
  2725. +   value *v;
  2726. +
  2727. +   gpr = in.d.ij_index >> 1;
  2728. +   base_chan = ((in.d.ij_index & 1) << 1) + 1;
  2729. +   for (i = 0; i < 4; ++i) {
  2730. +       alu_node *a = create_alu(op);
  2731. +       if ((i >> 1) == type)
  2732. +           v = sh->get_value(VLK_TGSI_INPUT, sel_chan(in.tgsi_index, i));
  2733. +       else
  2734. +           v = NULL;
  2735. +       a->dst.push_back(v);
  2736. +       v = sh->get_gpr_value(true, gpr, base_chan - (i & 1), false);
  2737. +       a->src.push_back(v);
  2738. +       v = sh->get_special_ro_value(
  2739. +           sel_chan(ALU_SRC_PARAM_OFFSET + in.d.lds_pos, i));
  2740. +       a->src.push_back(v);
  2741. +       a->bc.slot = i;
  2742. +       p->push_back(a);
  2743. +   }
  2744. +   return p;
  2745. +}
  2746. +
  2747. +alu_group_node* tgsi_translator::build_interp_flat(shader_io& in) {
  2748. +   alu_group_node *g = sh->create_alu_group();
  2749. +   value *v;
  2750. +
  2751. +   for (unsigned i = 0; i < 4; ++i) {
  2752. +       alu_node *a = create_alu(ALU_OP1_INTERP_LOAD_P0);
  2753. +       v = sh->get_value(VLK_TGSI_INPUT, sel_chan(in.tgsi_index, i));
  2754. +       a->dst.push_back(v);
  2755. +       v = sh->get_special_ro_value(
  2756. +           sel_chan(ALU_SRC_PARAM_OFFSET + in.d.lds_pos, i));
  2757. +       a->src.push_back(v);
  2758. +       a->bc.slot = i;
  2759. +       g->push_back(a);
  2760. +   }
  2761. +   return g;
  2762. +}
  2763. +
  2764. +inline int tgsi_translator::get_ij(shader_io& in) {
  2765. +   int ij = 0;
  2766. +   if (in.d.interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
  2767. +       return in.d.centroid ? 1 : 0;
  2768. +   else if (in.d.interpolate == TGSI_INTERPOLATE_LINEAR)
  2769. +       return (interp_mask & 1) + ((interp_mask >> 2) & 1) +
  2770. +               (in.d.centroid ? 1 : 0);
  2771. +   return ij;
  2772. +}
  2773. +
  2774. +#define FOREACH_CHAN_UNMASKED for (unsigned ch = 0; ch < 4; ++ch)
  2775. +#define FOREACH_CHAN FOREACH_CHAN_UNMASKED if (write_mask & (1 << ch))
  2776. +
  2777. +value* tgsi_translator::get_arg_value(tgsi_arg &ta, unsigned chan) {
  2778. +   if (ta.values.empty())
  2779. +       ta.values.resize(4);
  2780. +
  2781. +   if (!ta.values[chan]) {
  2782. +       unsigned schan = ta.dst ? chan : ta.swz[chan];
  2783. +       if (ta.rel) {
  2784. +           value *r = get_tgsi_value(VLK_TGSI_ADDR, ta.rel_addr_index,
  2785. +               ta.rel_addr_chan);
  2786. +           ta.values[chan] = sh->get_reg_value(ta.kind, !ta.dst, ta.sel, schan,
  2787. +               ta.rel, r, ta.rel_array_id);
  2788. +       } else
  2789. +           ta.values[chan] = get_tgsi_value(ta.kind, ta.sel, schan);
  2790. +   }
  2791. +   return ta.values[chan];
  2792. +}
  2793. +
  2794. +value* tgsi_translator::get_arg_value(unsigned index, unsigned chan) {
  2795. +   tgsi_arg &tv = index ? args.src[index - 1] : args.dst;
  2796. +   return get_arg_value(tv, chan);
  2797. +}
  2798. +
  2799. +int tgsi_translator::ti_alu() {
  2800. +   switch (info->tgsi_op) {
  2801. +   case TGSI_OPCODE_SUB:
  2802. +       args.src[1].neg = !args.src[1].neg;
  2803. +       break;
  2804. +   case TGSI_OPCODE_ABS:
  2805. +       args.src[0].neg = 0;
  2806. +       args.src[0].abs = 1;
  2807. +       break;
  2808. +   }
  2809. +
  2810. +   begin_group();
  2811. +   if (unlikely(info->flags & TIF_ALU_SWAPSRC01)) {
  2812. +       FOREACH_CHAN
  2813. +       {
  2814. +           emit_alu(info->isa_op, tgsi_dst(ch), clamp, asrc(args.src[1], ch),
  2815. +               asrc(args.src[0], ch));
  2816. +       }
  2817. +   } else {
  2818. +       FOREACH_CHAN
  2819. +       {
  2820. +           emit_alu(info->isa_op, ch);
  2821. +       }
  2822. +   }
  2823. +   end_group();
  2824. +   return 0;
  2825. +}
  2826. +
  2827. +int tgsi_translator::ti_dot() {
  2828. +
  2829. +   unsigned nc, s1 = 0, i;
  2830. +   switch (info->tgsi_op) {
  2831. +   case TGSI_OPCODE_DP2:
  2832. +       nc = 2;
  2833. +       break;
  2834. +   case TGSI_OPCODE_DP3:
  2835. +       nc = 3;
  2836. +       break;
  2837. +   case TGSI_OPCODE_DP4:
  2838. +       nc = 4;
  2839. +       break;
  2840. +   case TGSI_OPCODE_DPH:
  2841. +       nc = 4;
  2842. +       s1 = 1;
  2843. +       break;
  2844. +   default:
  2845. +       nc = 0;
  2846. +       assert(!"ti_dot: unexpected tgsi opcode");
  2847. +   }
  2848. +
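        +   // DOT4 always occupies all four vector slots: DPH feeds 1.0 for the
        +   // missing src0.w, and leftover slots are padded with 0*0 so they do
        +   // not affect the sum; the scalar result is replicated afterwards if
        +   // more than one destination channel is written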
  2849. +   unsigned ch = __builtin_ctz(write_mask);
  2850. +   unsigned nwc = __builtin_popcount(write_mask);
  2851. +
  2852. +   value *t = nwc > 1 ? create_temp() : tgsi_dst(ch);
  2853. +
  2854. +   alu_packed_node *p = sh->create_alu_packed();
  2855. +   alu_node *a;
  2856. +   for (i = 0; i < nc - s1; ++i) {
  2857. +       a = build_alu(ALU_OP2_DOT4, (i == ch) ? t : NULL, clamp,
  2858. +               asrc(args.src[0], i), asrc(args.src[1], i));
  2859. +       a->bc.slot = i;
  2860. +       p->push_back(a);
  2861. +   }
  2862. +   if (s1) {
  2863. +       a = build_alu(ALU_OP2_DOT4, (i == ch) ? t : NULL, clamp,
  2864. +               asrc(literal(1.0f)), asrc(args.src[1], i));
  2865. +       a->bc.slot = i++;
  2866. +       p->push_back(a);
  2867. +   }
  2868. +   for (; i < 4; ++i) {
  2869. +       a = build_alu(ALU_OP2_DOT4, (i == ch) ? t : NULL, clamp,
  2870. +               asrc(literal(0)), asrc(literal(0)));
  2871. +       a->bc.slot = i;
  2872. +       p->push_back(a);
  2873. +   }
  2874. +   emit_node(p);
  2875. +
  2876. +   if (nwc > 1)
  2877. +       ti_replicate(t);
  2878. +
  2879. +   return 0;
  2880. +}
  2881. +
  2882. +int tgsi_translator::ti_repl() {
  2883. +   switch (info->tgsi_op) {
  2884. +   case TGSI_OPCODE_RSQ:
  2885. +       args.src[0].abs = 1;
  2886. +       args.src[0].neg = 0;
  2887. +       break;
  2888. +   }
  2889. +
  2890. +   value *t = create_temp();
  2891. +   emit_alu(info->isa_op, t, clamp, asrc(args.src[0], SEL_X));
  2892. +   ti_replicate(t);
  2893. +   return 0;
  2894. +}
  2895. +
  2896. +int tgsi_translator::emit_alu(unsigned op, int chan, int dstchan) {
  2897. +   unsigned slots = ctx.alu_slots(op);
  2898. +   int expand = 0, i;
  2899. +
  2900. +   dstchan = (dstchan == -1) ? chan : dstchan;
  2901. +
  2902. +   assert(slots);
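        +   // cayman has no separate trans slot: trans-only ops (AF_S) and ops
        +   // flagged AF_CM_EXPAND are replicated across all four vector slots,
        +   // and only the slot matching the destination channel keeps its dst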
  2903. +   if (ctx.is_cayman() && (slots == AF_S || (slots & AF_CM_EXPAND)))
  2904. +       expand = 4;
  2905. +
  2906. +   if (expand) {
  2907. +       alu_packed_node *p = sh->create_alu_packed();
  2908. +       for (i = 0; i < expand; ++i) {
  2909. +           alu_node *a = build_alu(op, chan, i);
  2910. +           if (i != dstchan)
  2911. +               a->dst[0] = NULL;
  2912. +           a->bc.slot = i;
  2913. +           p->push_back(a);
  2914. +       }
  2915. +       emit_node(p);
  2916. +   } else {
  2917. +       alu_node *a = build_alu(op, chan, dstchan);
  2918. +       emit_node(a);
  2919. +   }
  2920. +   return 0;
  2921. +}
  2922. +
  2923. +alu_node* tgsi_translator::build_alu(unsigned op, int chan, int dstchan) {
  2924. +   alu_node *a = create_alu(op);
  2925. +   unsigned i, nsrc = a->bc.op_ptr->src_count;
  2926. +
  2927. +   dstchan = (dstchan == -1) ? chan : dstchan;
  2928. +   a->dst.push_back(tgsi_dst(dstchan));
  2929. +   a->bc.clamp = clamp;
  2930. +
  2931. +   for (i = 0; i < nsrc; ++i) {
  2932. +       a->src.push_back(get_arg_value(1 + i, chan));
  2933. +       a->bc.src[i].neg = args.src[i].neg;
  2934. +       a->bc.src[i].abs = args.src[i].abs;
  2935. +   }
  2936. +   return a;
  2937. +}
  2938. +
  2939. +alu_node* tgsi_translator::create_alu(unsigned op) {
  2940. +   alu_node *a = sh->create_alu();
  2941. +   a->bc.set_op(op);
  2942. +   a->bc.slot_flags = (alu_op_flags) ctx.alu_slots(a->bc.op_ptr);
  2943. +   if (a->bc.op_ptr->flags & AF_KILL) {
  2944. +       a->flags |= NF_DONT_HOIST | NF_DONT_MOVE | NF_DONT_KILL
  2945. +               | NF_SCHEDULE_EARLY;
  2946. +   } else if (a->bc.op_ptr->flags & (AF_PRED | AF_MOVA)) {
  2947. +       a->flags |= NF_DONT_HOIST;
  2948. +   }
  2949. +   return a;
  2950. +}
  2951. +
  2952. +int tgsi_translator::ti_trig() {
  2953. +   value *t = prepare_trig(asrc(args.src[0], 0));
  2954. +   emit_alu(info->isa_op, t, clamp, asrc(t));
  2955. +   ti_replicate(t);
  2956. +   return 0;
  2957. +}
  2958. +
  2959. +int tgsi_translator::ti_scs() {
  2960. +   value *t = prepare_trig(asrc(args.src[0], 0));
  2961. +
  2962. +   begin_group();
  2963. +   if (write_mask & (1 << SEL_X))
  2964. +       emit_alu(ALU_OP1_COS, tgsi_dst(SEL_X), clamp, asrc(t));
  2965. +   if (write_mask & (1 << SEL_Y))
  2966. +       emit_alu(ALU_OP1_SIN, tgsi_dst(SEL_Y), clamp, asrc(t));
  2967. +   if (write_mask & (1 << SEL_Z))
  2968. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_Z), 0, asrc(0.0f));
  2969. +   if (write_mask & (1 << SEL_W))
  2970. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_W), 0, asrc(1.0f));
  2971. +   end_group();
  2972. +   return 0;
  2973. +}
  2974. +
  2975. +value* tgsi_translator::prepare_trig(alu_src s) {
  2976. +   static float half_inv_pi = 1.0 / (3.1415926535 * 2);
  2977. +   static float double_pi = 3.1415926535 * 2;
  2978. +   static float neg_pi = -3.1415926535;
  2979. +
  2980. +   value *t = create_temp();
  2981. +
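        +   // range reduction: t = fract(src / (2*pi) + 0.5) maps the angle into
        +   // one period; r600 SIN/COS take radians, so scale back to [-pi, pi),
        +   // while later chips work on the normalized value and only need the
        +   // -0.5 shift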
  2982. +   emit_alu(ALU_OP3_MULADD, t, 0, s, asrc(half_inv_pi), asrc(0.5f));
  2983. +   emit_alu(ALU_OP1_FRACT, t, 0, asrc(t));
  2984. +
  2985. +   if (ctx.is_r600())
  2986. +       emit_alu(ALU_OP3_MULADD, t, 0, asrc(t), asrc(double_pi), asrc(neg_pi));
  2987. +   else
  2988. +#if 0
  2989. +       emit_alu(ALU_OP2_ADD, t, 0, asrc(t), asrc(-0.5f));
  2990. +#else
  2991. +       // using muladd just to reduce differences from the default
  2992. +       // backend, for easier debugging
  2993. +       emit_alu(ALU_OP3_MULADD, t, 0, asrc(t), asrc(1.0f), asrc(0.5f, 0, 1));
  2994. +#endif
  2995. +
  2996. +   return t;
  2997. +}
  2998. +
  2999. +int tgsi_translator::ti_exp() {
  3000. +   value* t = create_temp();
  3001. +
  3002. +   if (write_mask & (1 << SEL_X))
  3003. +       emit_alu(ALU_OP1_FLOOR, t, 0, asrc(args.src[0], 0));
  3004. +
  3005. +   begin_group();
  3006. +   if (write_mask & (1 << SEL_X))
  3007. +       emit_alu(ALU_OP1_EXP_IEEE, tgsi_dst(SEL_X), clamp, asrc(t));
  3008. +   if (write_mask & (1 << SEL_Y))
  3009. +       emit_alu(ALU_OP1_FRACT, tgsi_dst(SEL_Y), clamp, asrc(args.src[0], 0));
  3010. +   if (write_mask & (1 << SEL_Z))
  3011. +       emit_alu(ALU_OP1_EXP_IEEE, tgsi_dst(SEL_Z), clamp,
  3012. +           asrc(args.src[0], 0));
  3013. +   if (write_mask & (1 << SEL_W))
  3014. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_W), 0, asrc(1.0f));
  3015. +   end_group();
  3016. +   return 0;
  3017. +}
  3018. +
  3019. +int tgsi_translator::ti_log() {
  3020. +   value *t = create_temp();
  3021. +   value *t2 = create_temp();
  3022. +   value *t3 = create_temp();
  3023. +
  3024. +   alu_src s = asrc(args.src[0], 0, 1, 0);
  3025. +
  3026. +   if (write_mask & 0x7)
  3027. +       emit_alu(ALU_OP1_LOG_IEEE, t, 0, s);
  3028. +   if (write_mask & 0x3)
  3029. +       emit_alu(ALU_OP1_FLOOR, t2, 0, asrc(t));
  3030. +   if (write_mask & (1 << SEL_Y)) {
  3031. +       emit_alu(ALU_OP1_EXP_IEEE, t3, 0, asrc(t2));
  3032. +       emit_alu(ALU_OP1_RECIP_IEEE, t3, 0, asrc(t3));
  3033. +   }
  3034. +
  3035. +   begin_group();
  3036. +   if (write_mask & (1 << SEL_X))
  3037. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_X), clamp, asrc(t2));
  3038. +   if (write_mask & (1 << SEL_Y))
  3039. +       emit_alu(ALU_OP2_MUL, tgsi_dst(SEL_Y), clamp, s, asrc(t3));
  3040. +   if (write_mask & (1 << SEL_Z))
  3041. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_Z), clamp, asrc(t));
  3042. +   if (write_mask & (1 << SEL_W))
  3043. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_W), 0, asrc(1.0f));
  3044. +   end_group();
  3045. +   return 0;
  3046. +}
  3047. +
  3048. +int tgsi_translator::ti_dst() {
  3049. +   begin_group();
  3050. +   if (write_mask & (1 << SEL_X))
  3051. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_X), 0, asrc(1.0f));
  3052. +   if (write_mask & (1 << SEL_Y))
  3053. +       emit_alu(ALU_OP2_MUL, tgsi_dst(SEL_Y), clamp, asrc(args.src[0], SEL_Y),
  3054. +           asrc(args.src[1], SEL_Y));
  3055. +   if (write_mask & (1 << SEL_Z))
  3056. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_Z), clamp, asrc(args.src[0], SEL_Z));
  3057. +   if (write_mask & (1 << SEL_W))
  3058. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_W), clamp, asrc(args.src[1], SEL_W));
  3059. +   end_group();
  3060. +   return 0;
  3061. +}
  3062. +
  3063. +int tgsi_translator::ti_lrp() {
  3064. +   vvec t;
  3065. +   create_temps(t, 4);
  3066. +
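        +   // lrp(a, x, y) = a*x + (1 - a)*y: compute t = (1 - a)*y per channel
        +   // first, then a single MULADD per channel produces a*x + t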
  3067. +   FOREACH_CHAN
  3068. +   {
  3069. +       emit_alu(ALU_OP2_ADD, t[ch], 0, asrc(1.0f),
  3070. +           asrc(args.src[0], ch, 0, 1));
  3071. +       emit_alu(ALU_OP2_MUL, t[ch], 0, asrc(t[ch]), asrc(args.src[2], ch));
  3072. +   }
  3073. +   begin_group();
  3074. +   FOREACH_CHAN
  3075. +   {
  3076. +       emit_alu(ALU_OP3_MULADD, tgsi_dst(ch), clamp, asrc(args.src[0], ch),
  3077. +           asrc(args.src[1], ch), asrc(t[ch]));
  3078. +   }
  3079. +   end_group();
  3080. +   return 0;
  3081. +}
  3082. +
  3083. +int tgsi_translator::ti_pow() {
  3084. +   value* t = create_temp();
  3085. +
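        +   // pow(x, y) is expanded to exp2(y * log2(x)), with the optional clamp
        +   // applied on the final EXP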
  3086. +   emit_alu(ALU_OP1_LOG_IEEE, t, 0, asrc(args.src[0], 0));
  3087. +   emit_alu(ALU_OP2_MUL, t, 0, asrc(args.src[1], 0), asrc(t));
  3088. +   emit_alu(ALU_OP1_EXP_IEEE, t, clamp, asrc(t));
  3089. +   ti_replicate(t);
  3090. +   return 0;
  3091. +}
  3092. +
  3093. +int tgsi_translator::ti_replicate(value* t) {
  3094. +   begin_group();
  3095. +   FOREACH_CHAN
  3096. +   {
  3097. +       emit_alu(ALU_OP1_MOV, tgsi_dst(ch), 0, asrc(t));
  3098. +   }
  3099. +   end_group();
  3100. +   return 0;
  3101. +}
  3102. +
  3103. +int tgsi_translator::ti_xpd() {
  3104. +   static const unsigned int src0_swizzle[] = { 2, 0, 1 };
  3105. +   static const unsigned int src1_swizzle[] = { 1, 2, 0 };
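        +   // cross product in two passes using the swizzles above:
        +   //   dst.xyz = src0.yzx * src1.zxy - src0.zxy * src1.yzx,  dst.w = 1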
  3106. +   vvec t;
  3107. +
  3108. +   create_temps(t, 3);
  3109. +   FOREACH_CHAN
  3110. +   {
  3111. +       if (ch < SEL_W)
  3112. +           emit_alu(ALU_OP2_MUL, t[ch], 0, asrc(args.src[0], src0_swizzle[ch]),
  3113. +               asrc(args.src[1], src1_swizzle[ch]));
  3114. +   }
  3115. +   begin_group();
  3116. +   FOREACH_CHAN
  3117. +   {
  3118. +       if (ch < SEL_W)
  3119. +           emit_alu(ALU_OP3_MULADD, tgsi_dst(ch), clamp,
  3120. +               asrc(args.src[0], src1_swizzle[ch]),
  3121. +               asrc(args.src[1], src0_swizzle[ch]), asrc(t[ch], 0, 1));
  3122. +       else
  3123. +           emit_alu(ALU_OP1_MOV, tgsi_dst(ch), 0, asrc(1.0f));
  3124. +   }
  3125. +   end_group();
  3126. +   return 0;
  3127. +}
  3128. +
  3129. +int tgsi_translator::ti_kill() {
  3130. +   int i;
  3131. +
  3132. +   // XXX if this affects performance, we might want to do it after DCE
  3133. +   uses_kill = true;
  3134. +
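        +   // KILL_IF discards the pixel when any source component is negative
        +   // (KILLGT with 0 > src); unconditional KILL emits KILLGT(1, 0), whose
        +   // condition is always true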
  3135. +   for (i = 0; i < 4; ++i) {
  3136. +       if (info->tgsi_op == TGSI_OPCODE_KILL_IF)
  3137. +           emit_alu(ALU_OP2_KILLGT, NULL, 0, asrc(0.0f), asrc(args.src[0], i));
  3138. +       else
  3139. +           emit_alu(ALU_OP2_KILLGT, NULL, 0, asrc(1.0f), asrc(0.0f));
  3140. +   }
  3141. +   return 0;
  3142. +}
  3143. +
  3144. +int tgsi_translator::ti_arl() {
  3145. +   switch (info->tgsi_op) {
  3146. +   case TGSI_OPCODE_ARL:
  3147. +       if (ctx.is_egcm()) {
  3148. +           emit_alu(ALU_OP1_FLT_TO_INT_FLOOR, tgsi_dst(SEL_X), 0,
  3149. +               asrc(args.src[0], 0));
  3150. +       } else {
  3151. +           value *t = create_temp();
  3152. +           emit_alu(ALU_OP1_FLOOR, t, 0, asrc(args.src[0], 0));
  3153. +           emit_alu(ALU_OP1_FLT_TO_INT, tgsi_dst(SEL_X), 0, asrc(t));
  3154. +       }
  3155. +       break;
  3156. +   case TGSI_OPCODE_ARR:
  3157. +       emit_alu(ALU_OP1_FLT_TO_INT, tgsi_dst(SEL_X), 0, asrc(args.src[0], 0));
  3158. +       break;
  3159. +   case TGSI_OPCODE_UARL:
  3160. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_X), 0, asrc(args.src[0], 0));
  3161. +       break;
  3162. +   default:
  3163. +       assert(!"ti_arl: unexpected opcode");
  3164. +   }
  3165. +   return 0;
  3166. +}
  3167. +
  3168. +int tgsi_translator::ti_ssg() {
  3169. +   vvec t;
  3170. +   create_temps(t, 4);
  3171. +   if (info->tgsi_op == TGSI_OPCODE_SSG) {
  3172. +       FOREACH_CHAN
  3173. +       {
  3174. +           emit_alu(ALU_OP3_CNDGE, t[ch], 0, asrc(args.src[0], ch), asrc(0.0f),
  3175. +               asrc(-1.0f));
  3176. +       }
  3177. +       begin_group();
  3178. +       FOREACH_CHAN
  3179. +       {
  3180. +           emit_alu(ALU_OP3_CNDGT, tgsi_dst(ch), 0, asrc(args.src[0], ch),
  3181. +               asrc(1.0f), asrc(t[ch]));
  3182. +       }
  3183. +       end_group();
  3184. +   } else { // ISSG
  3185. +       FOREACH_CHAN
  3186. +       {
  3187. +           emit_alu(ALU_OP3_CNDGE_INT, t[ch], 0, asrc(args.src[0], ch),
  3188. +               asrc(0u), asrc(-1u));
  3189. +       }
  3190. +       begin_group();
  3191. +       FOREACH_CHAN
  3192. +       {
  3193. +           emit_alu(ALU_OP3_CNDGT_INT, tgsi_dst(ch), 0, asrc(args.src[0], ch),
  3194. +               asrc(1u), asrc(t[ch]));
  3195. +       }
  3196. +       end_group();
  3197. +   }
  3198. +   return 0;
  3199. +}
  3200. +
  3201. +int tgsi_translator::ti_cmp() {
  3202. +   begin_group();
  3203. +   FOREACH_CHAN
  3204. +   {
  3205. +       if (info->tgsi_op == TGSI_OPCODE_CMP)
  3206. +           emit_alu(ALU_OP3_CNDGE, tgsi_dst(ch), clamp, asrc(args.src[0], ch),
  3207. +               asrc(args.src[2], ch), asrc(args.src[1], ch));
  3208. +       else
  3209. +           emit_alu(ALU_OP3_CNDE_INT, tgsi_dst(ch), 0, asrc(args.src[0], ch),
  3210. +               asrc(args.src[2], ch), asrc(args.src[1], ch));
  3211. +   }
  3212. +   end_group();
  3213. +   return 0;
  3214. +}
  3215. +
  3216. +int tgsi_translator::ti_umad() {
  3217. +   vvec t;
  3218. +   create_temps(t, 4);
  3219. +
  3220. +   FOREACH_CHAN
  3221. +   {
  3222. +       emit_alu(ALU_OP2_MULLO_INT, t[ch], 0, asrc(args.src[0], ch),
  3223. +           asrc(args.src[1], ch));
  3224. +   }
  3225. +   begin_group();
  3226. +   FOREACH_CHAN
  3227. +   {
  3228. +       emit_alu(ALU_OP2_ADD_INT, tgsi_dst(ch), 0, asrc(t[ch]),
  3229. +           asrc(args.src[2], ch));
  3230. +   }
  3231. +   end_group();
  3232. +   return 0;
  3233. +}
  3234. +
  3235. +int tgsi_translator::ti_f2iu() {
  3236. +   vvec t;
  3237. +   create_temps(t, 4);
  3238. +   FOREACH_CHAN
  3239. +   {
  3240. +       emit_alu(ALU_OP1_TRUNC, t[ch], 0, asrc(args.src[0], ch));
  3241. +   }
  3242. +   begin_group();
  3243. +   FOREACH_CHAN
  3244. +   {
  3245. +       emit_alu(info->isa_op, tgsi_dst(ch), 0, t[ch]);
  3246. +   }
  3247. +   end_group();
  3248. +   return 0;
  3249. +}
  3250. +
  3251. +int tgsi_translator::ti_ineg() {
  3252. +   begin_group();
  3253. +   FOREACH_CHAN
  3254. +   {
  3255. +       emit_alu(ALU_OP2_SUB_INT, tgsi_dst(ch), 0, asrc(0u),
  3256. +           asrc(args.src[0], ch));
  3257. +   }
  3258. +   end_group();
  3259. +   return 0;
  3260. +}
  3261. +
  3262. +int tgsi_translator::ti_iabs() {
  3263. +   vvec t;
  3264. +   create_temps(t, 4);
  3265. +   FOREACH_CHAN
  3266. +   {
  3267. +       emit_alu(ALU_OP2_SUB_INT, t[ch], 0, asrc(0u), asrc(args.src[0], ch));
  3268. +   }
  3269. +   begin_group();
  3270. +   FOREACH_CHAN
  3271. +   {
  3272. +       emit_alu(ALU_OP3_CNDGE_INT, tgsi_dst(ch), 0, asrc(args.src[0], ch),
  3273. +           asrc(args.src[0], ch), asrc(t[ch]));
  3274. +   }
  3275. +   end_group();
  3276. +   return 0;
  3277. +}
  3278. +
  3279. +int tgsi_translator::ti_divmod() {
  3280. +
  3281. +   bool signed_op = false, mod = false;
  3282. +   switch (info->tgsi_op) {
  3283. +   case TGSI_OPCODE_MOD:
  3284. +       mod = true; /* fall through */
  3285. +   case TGSI_OPCODE_IDIV:
  3286. +       signed_op = true;
  3287. +       break;
  3288. +   case TGSI_OPCODE_UMOD:
  3289. +       mod = true; /* fall through */
  3290. +   case TGSI_OPCODE_UDIV:
  3291. +       break;
  3292. +   default:
  3293. +       assert(!"ti_divmod: unexpected tgsi opcode");
  3294. +   }
  3295. +
  3296. +   // TODO optimize for constant src1 (omit RECIP error correction)
  3297. +
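        +   // unsigned division via reciprocal: tmp0.x ~= 2^32/src2 (RECIP_UINT,
        +   // or the float path on cayman), refined by one error-correction step
        +   // (steps 2-9); q = hi(tmp0.x * src1) is then adjusted by +/-1 based on
        +   // the remainder tests (steps 13-18); the signed variants work on the
        +   // absolute values and fix the sign of the result at the end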
  3298. +   value *t0x = create_temp();
  3299. +   value *t0y = create_temp();
  3300. +   value *t0z = create_temp();
  3301. +   value *t0w = create_temp();
  3302. +   value *t1x = create_temp();
  3303. +   value *t1y = create_temp();
  3304. +   value *t1z = create_temp();
  3305. +   value *t1w = create_temp();
  3306. +   value *t2x = create_temp();
  3307. +   value *t2y = create_temp();
  3308. +   value *t2z = create_temp();
  3309. +   value *t3x = create_temp();
  3310. +
  3311. +   vvec dst;
  3312. +   create_temps(dst, 4);
  3313. +
  3314. +   FOREACH_CHAN
  3315. +   {
  3316. +       if (signed_op) {
  3317. +           /* tmp2.x = -src0 */
  3318. +           emit_alu(ALU_OP2_SUB_INT, t2x, 0, asrc(0u), asrc(args.src[0], ch));
  3319. +           /* tmp2.y = -src1 */
  3320. +           emit_alu(ALU_OP2_SUB_INT, t2y, 0, asrc(0u), asrc(args.src[1], ch));
  3321. +           /* tmp2.z sign bit is set if src0 and src1 signs differ; */
  3322. +           /* it will be the sign of the quotient */
  3323. +           if (!mod) {
  3324. +               emit_alu(ALU_OP2_XOR_INT, t2z, 0, asrc(args.src[0], ch),
  3325. +                   asrc(args.src[1], ch));
  3326. +           }
  3327. +           /* tmp2.x = |src0| */
  3328. +           emit_alu(ALU_OP3_CNDGE_INT, t2x, 0, asrc(args.src[0], ch),
  3329. +               asrc(args.src[0], ch), asrc(t2x));
  3330. +           /* tmp2.y = |src1| */
  3331. +           emit_alu(ALU_OP3_CNDGE_INT, t2y, 0, asrc(args.src[1], ch),
  3332. +               asrc(args.src[1], ch), asrc(t2y));
  3333. +       } else { // unsigned
  3334. +           // copy the sources into the same temps as in the signed
  3335. +           // variant just to simplify generation of the following ops;
  3336. +           // the copies will be propagated away later anyway.
  3337. +           emit_alu(ALU_OP1_MOV, t2x, 0, asrc(args.src[0], ch));
  3338. +           emit_alu(ALU_OP1_MOV, t2y, 0, asrc(args.src[1], ch));
  3339. +       }
  3340. +
  3341. +       /* 1. tmp0.x = rcp_u (src2)      (2^32/src2 + e,  e - rounding error)*/
  3342. +       if (ctx.is_cayman()) {
  3343. +           /* tmp3.x = u2f(src2) */
  3344. +           emit_alu(ALU_OP1_UINT_TO_FLT, t3x, 0, asrc(t2y));
  3345. +           /* tmp0.x = recip(tmp3.x) */
  3346. +           emit_alu(ALU_OP1_RECIP_IEEE, t0x, 0, asrc(t3x));
  3347. +           /* tmp3.x = tmp0.x * float(0x4f800000) */
  3348. +           emit_alu(ALU_OP2_MUL, t3x, 0, asrc(t0x), asrc(0x4f800000u));
  3349. +           /* tmp0.x = f2u (tmp3.x) */
  3350. +           emit_alu(ALU_OP1_FLT_TO_UINT, t0x, 0, asrc(t3x));
  3351. +       } else {
  3352. +           /* tmp0.x = recip_uint src2 */
  3353. +           emit_alu(ALU_OP1_RECIP_UINT, t0x, 0, asrc(t2y));
  3354. +       }
  3355. +
  3356. +       /* 2. tmp0.z = lo (tmp0.x * src2) */
  3357. +       emit_alu(ALU_OP2_MULLO_UINT, t0z, 0, asrc(t0x), asrc(t2y));
  3358. +       /* 3. tmp0.w = -tmp0.z */
  3359. +       emit_alu(ALU_OP2_SUB_INT, t0w, 0, asrc(0u), asrc(t0z));
  3360. +       /* 4. tmp0.y = hi (tmp0.x * src2) */
  3361. +       emit_alu(ALU_OP2_MULHI_UINT, t0y, 0, asrc(t0x), asrc(t2y));
  3362. +       /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)    = abs(lo(rcp*src)) */
  3363. +       emit_alu(ALU_OP3_CNDE_INT, t0z, 0, asrc(t0y), asrc(t0w), asrc(t0z));
  3364. +       /* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
  3365. +       emit_alu(ALU_OP2_MULHI_UINT, t0w, 0, asrc(t0z), asrc(t0x));
  3366. +       /* 7. tmp1.x = tmp0.x - tmp0.w */
  3367. +       emit_alu(ALU_OP2_SUB_INT, t1x, 0, asrc(t0x), asrc(t0w));
  3368. +       /* 8. tmp1.y = tmp0.x + tmp0.w */
  3369. +       emit_alu(ALU_OP2_ADD_INT, t1y, 0, asrc(t0x), asrc(t0w));
  3370. +       /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
  3371. +       emit_alu(ALU_OP3_CNDE_INT, t0x, 0, asrc(t0y), asrc(t1y), asrc(t1x));
  3372. +       /* 10. tmp0.z = hi(tmp0.x * src1)     = q */
  3373. +       emit_alu(ALU_OP2_MULHI_UINT, t0z, 0, asrc(t0x), asrc(t2x));
  3374. +       /* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
  3375. +       emit_alu(ALU_OP2_MULLO_UINT, t0y, 0, asrc(t2y), asrc(t0z));
  3376. +       /* 12. tmp0.w = src1 - tmp0.y       = r */
  3377. +       emit_alu(ALU_OP2_SUB_INT, t0w, 0, asrc(t2x), asrc(t0y));
  3378. +       /* 13. tmp1.x = tmp0.w >= src2      = r >= src2 */
  3379. +       emit_alu(ALU_OP2_SETGE_UINT, t1x, 0, asrc(t0w), asrc(t2y));
  3380. +       /* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
  3381. +       emit_alu(ALU_OP2_SETGE_UINT, t1y, 0, asrc(t2x), asrc(t0y));
  3382. +
  3383. +       if (mod) { /* UMOD */
  3384. +           /* 15. tmp1.z = tmp0.w - src2           = r - src2 */
  3385. +           emit_alu(ALU_OP2_SUB_INT, t1z, 0, asrc(t0w), asrc(t2y));
  3386. +           /* 16. tmp1.w = tmp0.w + src2           = r + src2 */
  3387. +           emit_alu(ALU_OP2_ADD_INT, t1w, 0, asrc(t0w), asrc(t2y));
  3388. +       } else { /* UDIV */
  3389. +           /* 15. tmp1.z = tmp0.z + 1       = q + 1       DIV */
  3390. +           emit_alu(ALU_OP2_ADD_INT, t1z, 0, asrc(t0z), asrc(1u));
  3391. +           /* 16. tmp1.w = tmp0.z - 1          = q - 1 */
  3392. +           emit_alu(ALU_OP2_ADD_INT, t1w, 0, asrc(t0z), asrc(-1u));
  3393. +       }
  3394. +
  3395. +       /* 17. tmp1.x = tmp1.x & tmp1.y */
  3396. +       emit_alu(ALU_OP2_AND_INT, t1x, 0, asrc(t1x), asrc(t1y));
  3397. +
  3398. +       if (mod) {
  3399. +           /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
  3400. +           emit_alu(ALU_OP3_CNDE_INT, t0z, 0, asrc(t1x), asrc(t0w), asrc(t1z));
  3401. +       } else {
  3402. +           /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
  3403. +           emit_alu(ALU_OP3_CNDE_INT, t0z, 0, asrc(t1x), asrc(t0z), asrc(t1z));
  3404. +       }
  3405. +
  3406. +       if (signed_op) {
  3407. +           /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
  3408. +           emit_alu(ALU_OP3_CNDE_INT, t0z, 0, asrc(t1y), asrc(t1w), asrc(t0z));
  3409. +
  3410. +           /* fix the sign of the result */
  3411. +           /* tmp0.x = -tmp0.z */
  3412. +           emit_alu(ALU_OP2_SUB_INT, t0x, 0, asrc(0u), asrc(t0z));
  3413. +           if (mod) {
  3414. +               /* sign of the remainder is the same as the sign of src0 */
  3415. +               /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
  3416. +               emit_alu(ALU_OP3_CNDGE_INT, dst[ch], 0, asrc(t2x),
  3417. +                   asrc(t0z), asrc(t0x));
  3418. +           } else {
  3419. +               /* fix the quotient sign (same as the sign of src0*src1) */
  3420. +               /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
  3421. +               emit_alu(ALU_OP3_CNDGE_INT, dst[ch], 0, asrc(t2z),
  3422. +                   asrc(t0z), asrc(t0x));
  3423. +           }
  3424. +       } else { // unsigned
  3425. +           /* 19. dst = tmp1.y==0 ? tmp1.w : tmp0.z */
  3426. +           emit_alu(ALU_OP3_CNDE_INT, dst[ch], 0, asrc(t1y), asrc(t1w),
  3427. +               asrc(t0z));
  3428. +       }
  3429. +   }
  3430. +   begin_group();
  3431. +   FOREACH_CHAN
  3432. +   {
  3433. +       emit_alu(ALU_OP1_MOV, tgsi_dst(ch), 0, asrc(dst[ch]));
  3434. +   }
  3435. +   end_group();
  3436. +   return 0;
  3437. +}
  3438. +
  3439. +fetch_node* tgsi_translator::create_fetch(unsigned op) {
  3440. +   fetch_node *f = sh->create_fetch();
  3441. +   f->bc.set_op(op);
  3442. +   f->src.resize(4);
  3443. +   f->dst.resize(4);
  3444. +   VSWZ_XYZW(f->bc.dst_sel);
  3445. +   return f;
  3446. +}
  3447. +
  3448. +alu_node* tgsi_translator::build_alu(unsigned op, value *dst, int clamp,
  3449. +                                     value *s0, int s0abs, int s0neg, value *s1,
  3450. +                                     int s1abs, int s1neg, value *s2, int s2abs,
  3451. +                                     int s2neg) {
  3452. +
  3453. +   alu_node *a = create_alu(op);
  3454. +   unsigned nsrc = a->bc.op_ptr->src_count;
  3455. +
  3456. +   a->dst.push_back(dst);
  3457. +   a->bc.clamp = clamp;
  3458. +
  3459. +   if (nsrc >= 1) {
  3460. +       a->src.push_back(s0);
  3461. +       a->bc.src[0].neg = s0neg;
  3462. +       a->bc.src[0].abs = s0abs;
  3463. +       if (nsrc >= 2) {
  3464. +           a->src.push_back(s1);
  3465. +           a->bc.src[1].neg = s1neg;
  3466. +           a->bc.src[1].abs = s1abs;
  3467. +           if (nsrc == 3) {
  3468. +               a->src.push_back(s2);
  3469. +               a->bc.src[2].neg = s2neg;
  3470. +               a->bc.src[2].abs = s2abs;
  3471. +           }
  3472. +       }
  3473. +   }
  3474. +   return a;
  3475. +}
  3476. +
  3477. +inline value_kind tgsi_translator::file_to_value_kind(unsigned file) {
  3478. +   switch (file) {
  3479. +   case TGSI_FILE_INPUT:
  3480. +       return VLK_TGSI_INPUT;
  3481. +   case TGSI_FILE_OUTPUT:
  3482. +       return VLK_TGSI_OUTPUT;
  3483. +   case TGSI_FILE_TEMPORARY:
  3484. +       return VLK_TGSI_TEMP;
  3485. +   case TGSI_FILE_ADDRESS:
  3486. +       return VLK_TGSI_ADDR;
  3487. +   case TGSI_FILE_IMMEDIATE:
  3488. +       return VLK_CONST;
  3489. +   case TGSI_FILE_CONSTANT:
  3490. +       return VLK_KCACHE;
  3491. +   }
  3492. +   assert(!"unexpected tgsi file");
  3493. +   return VLK_INVALID;
  3494. +}
  3495. +
  3496. +inline alu_src tgsi_translator::asrc(value *v, int abs, int neg) {
  3497. +   return alu_src(v, abs, neg);
  3498. +}
  3499. +
  3500. +inline alu_src tgsi_translator::asrc(literal l, int abs, int neg) {
  3501. +   return alu_src(sh->get_const_value(l), abs, neg);
  3502. +}
  3503. +
  3504. +inline alu_src tgsi_translator::asrc(tgsi_arg& ta, int chan) {
  3505. +   return alu_src(get_arg_value(ta, chan), ta.abs, ta.neg);
  3506. +}
  3507. +
  3508. +inline alu_src tgsi_translator::asrc(tgsi_arg& ta, int chan, int abs, int neg) {
  3509. +   int sabs = ta.abs;
  3510. +   int sneg = ta.neg;
  3511. +
  3512. +   if (abs) {
  3513. +       sabs = 1;
  3514. +       sneg = 0;
  3515. +   }
  3516. +   if (neg) {
  3517. +       sneg = !sneg;
  3518. +   }
  3519. +   return alu_src(get_arg_value(ta, chan), sabs, sneg);
  3520. +}
  3521. +inline alu_src tgsi_translator::asrc(float f, int abs, int neg) {
  3522. +   return alu_src(sh->get_const_value(literal(f)), abs, neg);
  3523. +}
  3524. +inline alu_src tgsi_translator::asrc(uint32_t u, int abs, int neg) {
  3525. +   return alu_src(sh->get_const_value(literal(u)), abs, neg);
  3526. +}
  3527. +
  3528. +int tgsi_translator::ti_lit() {
  3529. +
  3530. +   value *tx = create_temp();
  3531. +   value *tz = create_temp();
  3532. +
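        +   // LIT: x = w = 1, y = max(src.x, 0),
        +   // z = src.x > 0 ? 2^(src.w * log2(max(src.y, 0))) : 0;
        +   // MUL_LIT takes care of the src.x > 0 test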
  3533. +   if (write_mask & (1 << SEL_Z)) {
  3534. +       emit_alu(ALU_OP2_MAX, tx, 0, asrc(args.src[0], SEL_Y), asrc(0.0f));
  3535. +       emit_alu(ALU_OP1_LOG_CLAMPED, tz, 0, asrc(tx));
  3536. +       emit_alu(ALU_OP3_MUL_LIT, tx, 0, asrc(tz), asrc(args.src[0], SEL_W),
  3537. +                asrc(args.src[0], SEL_X));
  3538. +   }
  3539. +   begin_group();
  3540. +   if (write_mask & (1 << SEL_X))
  3541. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_X), 0, asrc(1.0f));
  3542. +   if (write_mask & (1 << SEL_Y))
  3543. +       emit_alu(ALU_OP2_MAX, tgsi_dst(SEL_Y), clamp, asrc(args.src[0], SEL_X),
  3544. +           asrc(0.0f));
  3545. +   if (write_mask & (1 << SEL_Z))
  3546. +       emit_alu(ALU_OP1_EXP_IEEE, tgsi_dst(SEL_Z), clamp, asrc(tx));
  3547. +   if (write_mask & (1 << SEL_W))
  3548. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_W), 0, asrc(1.0f));
  3549. +   end_group();
  3550. +   return 0;
  3551. +}
  3552. +
  3553. +int tgsi_translator::emit_alu(unsigned op, value* dst, int clamp, value* s0,
  3554. +                              int s0abs, int s0neg, value* s1, int s1abs,
  3555. +                              int s1neg, value* s2, int s2abs, int s2neg) {
  3556. +
  3557. +   unsigned slots = ctx.alu_slots(op);
  3558. +   int expand = 0, i;
  3559. +
  3560. +   assert(slots);
  3561. +   if (ctx.is_cayman() && (slots == AF_S || (slots & AF_CM_EXPAND)))
  3562. +       expand = 4;
  3563. +   if (expand) {
  3564. +       int chan_hint = dst ? dst->select.chan() : 0;
  3565. +       alu_packed_node *p = sh->create_alu_packed();
  3566. +       for (i = 0; i < expand; ++i) {
  3567. +           alu_node *a = build_alu(op, (i == chan_hint) ? dst : NULL, clamp,
  3568. +               s0, s0abs, s0neg, s1, s1abs, s1neg, s2, s2abs, s2neg);
  3569. +           a->bc.slot = i;
  3570. +           p->push_back(a);
  3571. +       }
  3572. +       emit_node(p);
  3573. +   } else {
  3574. +       alu_node *a = build_alu(op, dst, clamp, s0, s0abs, s0neg, s1, s1abs,
  3575. +           s1neg, s2, s2abs, s2neg);
  3576. +       emit_node(a);
  3577. +   }
  3578. +   return 0;
  3579. +}
  3580. +
  3581. +value* tgsi_translator::get_tgsi_value(value_kind kind, unsigned index,
  3582. +                                       unsigned chan) {
  3583. +   switch (kind) {
  3584. +   case VLK_REG:
  3585. +       return sh->get_gpr_value(true, index, chan, false);
  3586. +   case VLK_CONST:
  3587. +       return sh->get_const_value(literals[(index << 2) + chan]);
  3588. +   case VLK_KCACHE:
  3589. +       return sh->get_kcache_value(0, index, chan);
  3590. +   case VLK_TGSI_INPUT:
  3591. +   case VLK_TGSI_OUTPUT:
  3592. +   case VLK_TGSI_TEMP:
  3593. +   case VLK_TGSI_ADDR:
  3594. +       return sh->get_value(kind, sel_chan(index, chan));
  3595. +   default:
  3596. +       assert(!"unexpected value kind");
  3597. +   }
  3598. +   return NULL;
  3599. +}
  3600. +
  3601. +int tgsi_translator::update_pipe_shader() {
  3602. +   int i;
  3603. +
  3604. +   for (i = 0; i < ninput; ++i) {
  3605. +       r600_shader_io *p = &ps->shader.input[i];
  3606. +       shader_io *s = &input[i];
  3607. +       memcpy(p, &s->d, sizeof(r600_shader_io));
  3608. +   }
  3609. +
  3610. +   for (i = 0; i < noutput; ++i) {
  3611. +       r600_shader_io *p = &ps->shader.output[i];
  3612. +       shader_io *s = &output[i];
  3613. +       memcpy(p, &s->d, sizeof(r600_shader_io));
  3614. +   }
  3615. +
  3616. +   ps->shader.ninput = ninput;
  3617. +   ps->shader.noutput = noutput;
  3618. +
  3619. +   // XXX this seems unused in the driver after some changes,
  3620. +   // and probably should simply be removed
  3621. +   ps->shader.nr_ps_max_color_exports = nr_ps_max_color_exports;
  3622. +
  3623. +   ps->shader.nr_ps_color_exports = nr_ps_color_exports;
  3624. +   ps->shader.clip_dist_write = clip_dist_write;
  3625. +   ps->shader.fs_write_all = fs_write_all;
  3626. +   ps->shader.processor_type = tgsi_proc;
  3627. +   ps->shader.uses_kill = uses_kill;
  3628. +   ps->shader.vs_out_misc_write = vs_out_misc_write;
  3629. +   ps->shader.vs_out_point_size = vs_out_point_size;
  3630. +   ps->shader.uses_tex_buffers = uses_tex_buffers;
  3631. +   ps->shader.has_txq_cube_array_z_comp = has_txq_cube_array_z_comp;
  3632. +   ps->shader.two_side = two_side;
  3633. +
  3634. +   return 0;
  3635. +}
  3636. +
  3637. +uint32_t tgsi_translator::get_immediate(sel_chan sc) {
  3638. +   return literals[sc - 1];
  3639. +}
  3640. +
  3641. +int tgsi_translator::ti_buffer_txq() {
  3642. +   int id = args.src[1].sel;
  3643. +
  3644. +   if (ctx.is_egcm())
  3645. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_X), 0,
  3646. +           asrc(
  3647. +               sh->get_kcache_value(R600_BUFFER_INFO_CONST_BUFFER, id >> 2,
  3648. +                   id & 3)));
  3649. +   else
  3650. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_X), 0,
  3651. +           asrc(
  3652. +               sh->get_kcache_value(R600_BUFFER_INFO_CONST_BUFFER, id << 1,
  3653. +                   1)));
  3654. +   return 0;
  3655. +}
  3656. +
  3657. +int tgsi_translator::ti_vtx_fetch() {
  3658. +
  3659. +   int id = args.src[1].sel, i;
  3660. +
  3661. +   fetch_node *f = create_fetch(FETCH_OP_VFETCH);
  3662. +   f->bc.resource_id = id + R600_MAX_CONST_BUFFERS;
  3663. +   f->bc.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
  3664. +   f->bc.mega_fetch_count = 16;
  3665. +   f->bc.use_const_fields = 1;
  3666. +   f->bc.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
  3667. +
  3668. +   for (i = 0; i < 4; ++i) {
  3669. +       unsigned s = (write_mask & (1 << i)) ? i : SEL_MASK;
  3670. +       f->bc.dst_sel[i] = s;
  3671. +       if (s != SEL_MASK)
  3672. +           f->dst[i] = get_arg_value(0, i);
  3673. +   }
  3674. +
  3675. +   f->src = get_vector_values(args.src[0].kind, args.src[0].sel,
  3676. +       args.src[0].swz);
  3677. +
  3678. +   emit_node(f);
  3679. +
  3680. +   if (ctx.is_egcm())
  3681. +       return 0;
  3682. +
  3683. +   FOREACH_CHAN
  3684. +   {
  3685. +       emit_alu(ALU_OP2_AND_INT, f->dst[ch], 0, asrc(f->dst[ch]),
  3686. +           asrc(
  3687. +               sh->get_kcache_value(R600_BUFFER_INFO_CONST_BUFFER, id << 1,
  3688. +                   ch)));
  3689. +   }
  3690. +
  3691. +   if (write_mask & (1 << SEL_W)) {
  3692. +       emit_alu(ALU_OP2_AND_INT, f->dst[SEL_W], 0, asrc(f->dst[SEL_W]),
  3693. +           asrc(
  3694. +               sh->get_kcache_value(R600_BUFFER_INFO_CONST_BUFFER,
  3695. +                   1 + (id << 1), 0)));
  3696. +   }
  3697. +   return 0;
  3698. +}
  3699. +
  3700. +int tgsi_translator::ti_tex() {
  3701. +   unsigned tgsi_op = info->tgsi_op;
  3702. +   unsigned texture = inst->Texture.Texture;
  3703. +   unsigned sampler_src_reg = (tgsi_op == TGSI_OPCODE_TXQ_LZ) ? 0 : 1;
  3704. +   int8_t offset[3] = { };
  3705. +   int opcode, i;
  3706. +   vvec src;
  3707. +
  3708. +   bool read_compressed_msaa = ps->shader.bc.has_compressed_msaa_texturing
  3709. +           && tgsi_op == TGSI_OPCODE_TXF
  3710. +           && (texture == TGSI_TEXTURE_2D_MSAA
  3711. +                   || texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
  3712. +
  3713. +   if (tgsi_op == TGSI_OPCODE_TXQ
  3714. +           && ((texture == TGSI_TEXTURE_CUBE_ARRAY
  3715. +                   || texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
  3716. +       if (write_mask & (1 << SEL_Z))
  3717. +           has_txq_cube_array_z_comp = true;
  3718. +
  3719. +   if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2
  3720. +           || inst->Instruction.Opcode == TGSI_OPCODE_TXB2
  3721. +           || inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
  3722. +       sampler_src_reg = 2;
  3723. +
  3724. +   if (texture == TGSI_TEXTURE_BUFFER) {
  3725. +       if (tgsi_op == TGSI_OPCODE_TXQ) {
  3726. +           uses_tex_buffers = true;
  3727. +           return ti_buffer_txq();
  3728. +       } else if (tgsi_op == TGSI_OPCODE_TXF) {
  3729. +           if (!ctx.is_egcm())
  3730. +               uses_tex_buffers = true;
  3731. +           return ti_vtx_fetch();
  3732. +       }
  3733. +   }
  3734. +
  3735. +   if (tgsi_op == TGSI_OPCODE_TXF) {
  3736. +       /* get offset values */
  3737. +       if (inst->Texture.NumOffsets) {
  3738. +           assert(inst->Texture.NumOffsets == 1);
  3739. +           offset[0] = literals[4 * inst->TexOffsets[0].Index
  3740. +                   + inst->TexOffsets[0].SwizzleX] << 1;
  3741. +           offset[1] = literals[4 * inst->TexOffsets[0].Index
  3742. +                   + inst->TexOffsets[0].SwizzleY] << 1;
  3743. +           offset[2] = literals[4 * inst->TexOffsets[0].Index
  3744. +                   + inst->TexOffsets[0].SwizzleZ] << 1;
  3745. +       }
  3746. +   } else if (tgsi_op == TGSI_OPCODE_TXP) {
  3747. +       /* Add perspective divide */
  3748. +       value *t = create_temp();
  3749. +       vvec t2;
  3750. +       create_temps(t2, 4);
  3751. +
  3752. +       emit_alu(ALU_OP1_RECIP_IEEE, t, 0, asrc(args.src[0], SEL_W));
  3753. +       for (i = 0; i < 3; i++)
  3754. +           emit_alu(ALU_OP2_MUL, t2[i], 0, asrc(t), asrc(args.src[0], i));
  3755. +       emit_alu(ALU_OP1_MOV, t2[SEL_W], 0, asrc(1.0f));
  3756. +       src = t2;
  3757. +   }
  3758. +
  3759. +   if ((texture == TGSI_TEXTURE_CUBE || texture == TGSI_TEXTURE_CUBE_ARRAY
  3760. +           || texture == TGSI_TEXTURE_SHADOWCUBE
  3761. +           || texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
  3762. +           && tgsi_op != TGSI_OPCODE_TXQ && tgsi_op != TGSI_OPCODE_TXQ_LZ) {
  3763. +
  3764. +       vvec tv;
  3765. +       static const int cube_swizzle[] = { SEL_Z, SEL_Z, SEL_X, SEL_Y };
  3766. +       alu_packed_node *p = sh->create_alu_packed();
  3767. +
  3768. +       create_temps(tv, 4);
  3769. +       for (i = 0; i < 4; ++i) {
  3770. +           alu_node *a = build_alu(ALU_OP2_CUBE, tv[i], 0,
  3771. +               asrc(args.src[0], cube_swizzle[i]),
  3772. +               asrc(args.src[0], cube_swizzle[3 - i]));
  3773. +           a->bc.slot = i;
  3774. +           p->push_back(a);
  3775. +       }
  3776. +       emit_node(p);
  3777. +
  3778. +       emit_alu(ALU_OP1_RECIP_IEEE, tv[SEL_Z], 0, asrc(tv[SEL_Z], 1));
  3779. +       emit_alu(ALU_OP3_MULADD, tv[SEL_X], 0, asrc(tv[SEL_X]), asrc(tv[SEL_Z]),
  3780. +           asrc(1.5f));
  3781. +       emit_alu(ALU_OP3_MULADD, tv[SEL_Y], 0, asrc(tv[SEL_Y]), asrc(tv[SEL_Z]),
  3782. +           asrc(1.5f));
  3783. +
  3784. +       /* write the initial compare value into the Z component:
  3785. +        - src0.W for shadow cube
  3786. +        - src1.X for shadow cube array */
  3787. +       if (texture == TGSI_TEXTURE_SHADOWCUBE)
  3788. +           emit_alu(ALU_OP1_MOV, tv[SEL_Z], 0, asrc(args.src[0], SEL_W));
  3789. +       else if (texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
  3790. +           emit_alu(ALU_OP1_MOV, tv[SEL_Z], 0, asrc(args.src[1], SEL_X));
  3791. +
  3792. +       if (texture == TGSI_TEXTURE_CUBE_ARRAY
  3793. +               || texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
  3794. +           if (ctx.is_egcm()) {
  3795. +               emit_alu(ALU_OP3_MULADD, tv[SEL_W], 0, asrc(args.src[0], SEL_W),
  3796. +                   asrc(8.0f), asrc(tv[SEL_W]));
  3797. +           } else {
  3798. +               fetch_node *f = create_fetch(FETCH_OP_SET_CUBEMAP_INDEX);
  3799. +               f->bc.sampler_id = args.src[sampler_src_reg].sel;
  3800. +               f->bc.resource_id = f->bc.sampler_id + R600_MAX_CONST_BUFFERS;
  3801. +
  3802. +               FILLV4(f->src, get_arg_value(1, SEL_W));
  3803. +               FILLV4(f->bc.coord_type, 1);
  3804. +
  3805. +               emit_node(f);
  3806. +           }
  3807. +       }
  3808. +
  3809. +       /* for cube forms of lod and bias we need to route the lod/bias value through Z */
  3810. +       // XXX just copy target value pointer to src?
  3811. +       if (tgsi_op == TGSI_OPCODE_TXB || tgsi_op == TGSI_OPCODE_TXL)
  3812. +           emit_alu(ALU_OP1_MOV, tv[SEL_Z], 0, asrc(args.src[0], SEL_W));
  3813. +       else if (tgsi_op == TGSI_OPCODE_TXB2 || tgsi_op == TGSI_OPCODE_TXL2)
  3814. +           emit_alu(ALU_OP1_MOV, tv[SEL_Z], 0, asrc(args.src[1], SEL_X));
  3815. +
  3816. +       src = tv;
  3817. +   }
  3818. +
  3819. +   if (src.empty()) {
  3820. +       src = get_vector_values(args.src[0].kind, args.src[0].sel,
  3821. +           args.src[0].swz);
  3822. +   }
  3823. +
  3824. +   /* Obtain the sample index for reading a compressed MSAA color texture.
  3825. +    * To read the FMASK, we use the ldfptr instruction, which tells us
  3826. +    * where the samples are stored.
  3827. +    * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
  3828. +    * which is the identity mapping. Each nibble says which physical sample
  3829. +    * should be fetched to get that sample.
  3830. +    *
  3831. +    * The sample index is in the last coordinate (src.w here); it gets
  3832. +    * remapped as:  src.w = (ldfptr() >> (src.w * 4)) & 0xF;
  3833. +    * Then fetch the texel with src.
  3834. +    */
  3835. +   if (read_compressed_msaa) {
  3836. +       fetch_node *f = create_fetch(FETCH_OP_LD);
  3837. +
  3838. +       f->bc.inst_mod = 1; /* LDFPTR */
  3839. +       f->bc.sampler_id = args.src[sampler_src_reg].sel;
  3840. +       f->bc.resource_id = f->bc.sampler_id + R600_MAX_CONST_BUFFERS;
  3841. +
  3842. +       value* tw = create_temp();
  3843. +
  3844. +       f->src = src;
  3845. +       f->dst[3] = tw;
  3846. +
  3847. +       VSWZ_INIT(f->bc.dst_sel, SEL_MASK, SEL_MASK, SEL_MASK, SEL_X);
  3848. +
  3849. +       f->bc.offset[0] = offset[0];
  3850. +       f->bc.offset[1] = offset[1];
  3851. +       f->bc.offset[2] = offset[2];
  3852. +
  3853. +       emit_node(f);
  3854. +
  3855. +       value *tx = create_temp();
  3856. +       emit_alu(ALU_OP2_MULLO_INT, tx, 0, asrc(src[3]), asrc(4u));
  3857. +       emit_alu(ALU_OP2_LSHR_INT, src[3], 0, asrc(tw), asrc(tx));
  3858. +       emit_alu(ALU_OP2_AND_INT, src[3], 0, asrc(src[3]), asrc(0xFu));
  3859. +   }
  3860. +
  3861. +   /* does this shader want the number of layers from TXQ for a cube array? */
  3862. +   if (has_txq_cube_array_z_comp) {
  3863. +       int id = args.src[sampler_src_reg].sel;
  3864. +
  3865. +       emit_alu(ALU_OP1_MOV, tgsi_dst(SEL_Z), 0,
  3866. +           asrc(sh->get_kcache_value(R600_TXQ_CONST_BUFFER,
  3867. +               id >> 2, id & 3)));
  3868. +
  3869. +       /* disable writemask from texture instruction */
  3870. +       write_mask &= ~(1 << SEL_Z);
  3871. +   }
  3872. +
  3873. +   opcode = info->isa_op;
  3874. +   if (texture == TGSI_TEXTURE_SHADOW1D || texture == TGSI_TEXTURE_SHADOW2D
  3875. +           || texture == TGSI_TEXTURE_SHADOWRECT
  3876. +           || texture == TGSI_TEXTURE_SHADOWCUBE
  3877. +           || texture == TGSI_TEXTURE_SHADOW1D_ARRAY
  3878. +           || texture == TGSI_TEXTURE_SHADOW2D_ARRAY
  3879. +           || texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
  3880. +       switch (opcode) {
  3881. +       case FETCH_OP_SAMPLE:
  3882. +           opcode = FETCH_OP_SAMPLE_C;
  3883. +           break;
  3884. +       case FETCH_OP_SAMPLE_L:
  3885. +           opcode = FETCH_OP_SAMPLE_C_L;
  3886. +           break;
  3887. +       case FETCH_OP_SAMPLE_LB:
  3888. +           opcode = FETCH_OP_SAMPLE_C_LB;
  3889. +           break;
  3890. +       case FETCH_OP_SAMPLE_G:
  3891. +           opcode = FETCH_OP_SAMPLE_C_G;
  3892. +           break;
  3893. +       }
  3894. +   }
  3895. +
  3896. +   fetch_node *f = create_fetch(opcode);
  3897. +   f->src = src;
  3898. +
  3899. +   if (tgsi_op == TGSI_OPCODE_TXD) {
  3900. +       vvec d;
  3901. +       for (i = 2; i > 0; --i) {
  3902. +           d = get_vector_values(args.src[i].kind, args.src[i].sel,
  3903. +               args.src[i].swz);
  3904. +           f->src.insert(f->src.end(), d.begin(), d.end());
  3905. +       }
  3906. +       sampler_src_reg = 3;
  3907. +   }
  3908. +
  3909. +   f->bc.sampler_id = args.src[sampler_src_reg].sel;
  3910. +   f->bc.resource_id = f->bc.sampler_id + R600_MAX_CONST_BUFFERS;
  3911. +
  3912. +   for (i = 0; i < 4; ++i) {
  3913. +       unsigned s = (write_mask & (1 << i)) ? i : SEL_MASK;
  3914. +       f->bc.dst_sel[i] = s;
  3915. +       if (s != SEL_MASK)
  3916. +           f->dst[i] = get_arg_value(0, i);
  3917. +   }
  3918. +
  3919. +   if (tgsi_op == TGSI_OPCODE_TXQ_LZ) {
  3920. +       FILLV4(f->src, sh->get_const_value(literal(0)));
  3921. +   }
  3922. +
  3923. +   if (texture == TGSI_TEXTURE_CUBE || texture == TGSI_TEXTURE_SHADOWCUBE
  3924. +           || texture == TGSI_TEXTURE_CUBE_ARRAY
  3925. +           || texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
  3926. +       f->src[0] = src[SEL_Y];
  3927. +       f->src[1] = src[SEL_X];
  3928. +       f->src[2] = src[SEL_W];
  3929. +       f->src[3] = src[SEL_Z]; /* route Z compare or Lod value into W */
  3930. +   }
  3931. +
  3932. +   if (texture != TGSI_TEXTURE_RECT && texture != TGSI_TEXTURE_SHADOWRECT) {
  3933. +       f->bc.coord_type[0] = f->bc.coord_type[1] = 1;
  3934. +   }
  3935. +   f->bc.coord_type[2] = f->bc.coord_type[3] = 1;
  3936. +
  3937. +   f->bc.offset[0] = offset[0];
  3938. +   f->bc.offset[1] = offset[1];
  3939. +   f->bc.offset[2] = offset[2];
  3940. +
  3941. +   /* Put the depth for comparison in W.
  3942. +    * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
  3943. +    * Some instructions expect the depth in Z. */
  3944. +   if ((texture == TGSI_TEXTURE_SHADOW1D || texture == TGSI_TEXTURE_SHADOW2D
  3945. +           || texture == TGSI_TEXTURE_SHADOWRECT
  3946. +           || texture == TGSI_TEXTURE_SHADOW1D_ARRAY)
  3947. +           && opcode != FETCH_OP_SAMPLE_C_L && opcode != FETCH_OP_SAMPLE_C_LB) {
  3948. +       f->src[SEL_W] = f->src[SEL_Z];
  3949. +   }
  3950. +
  3951. +   if (texture == TGSI_TEXTURE_1D_ARRAY
  3952. +           || texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
  3953. +       if (opcode == FETCH_OP_SAMPLE_C_L || opcode == FETCH_OP_SAMPLE_C_LB) {
  3954. +           /* the array index is read from Y */
  3955. +           f->bc.coord_type[SEL_Y] = 0;
  3956. +       } else {
  3957. +           /* the array index is read from Z */
  3958. +           f->bc.coord_type[SEL_Z] = 0;
  3959. +           f->src[SEL_Z] = f->src[SEL_Y];
  3960. +       }
  3961. +   } else if (texture == TGSI_TEXTURE_2D_ARRAY
  3962. +           || texture == TGSI_TEXTURE_SHADOW2D_ARRAY
  3963. +           || ((texture == TGSI_TEXTURE_CUBE_ARRAY
  3964. +                   || texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
  3965. +                   && ctx.is_egcm()))
  3966. +       /* the array index is read from Z */
  3967. +       f->bc.coord_type[SEL_Z] = 0;
  3968. +
  3969. +   /* mask unused source components */
  3970. +   if (opcode == FETCH_OP_SAMPLE) {
  3971. +       switch (texture) {
  3972. +       case TGSI_TEXTURE_2D:
  3973. +       case TGSI_TEXTURE_RECT:
  3974. +           f->src[SEL_Z] = NULL;
  3975. +           f->src[SEL_W] = NULL;
  3976. +           break;
  3977. +       case TGSI_TEXTURE_1D_ARRAY:
  3978. +           f->src[SEL_Y] = NULL;
  3979. +           f->src[SEL_W] = NULL;
  3980. +           break;
  3981. +       case TGSI_TEXTURE_1D:
  3982. +           f->src[SEL_Y] = NULL;
  3983. +           f->src[SEL_Z] = NULL;
  3984. +           f->src[SEL_W] = NULL;
  3985. +           break;
  3986. +       }
  3987. +   }
  3988. +
  3989. +   emit_node(f);
  3990. +
  3991. +   /* add shadow ambient support  - gallium doesn't do it yet */
  3992. +   return 0;
  3993. +}
  3994. +
  3995. +int tgsi_translator::ti_if() {
  3996. +   alu_node *a = build_alu(info->isa_op, sh->get_special_value(SV_EXEC_MASK),
  3997. +       0, asrc(args.src[0], 0), asrc(0.0f));
  3998. +
  3999. +   a->dst.insert(a->dst.begin(), 2, (value*)NULL);
  4000. +   emit_node(a);
  4001. +
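        +   // IF is built as region { depart { if (exec_mask) { depart { ... } } } }:
        +   // the outer depart is the false path, the inner one holds the 'then'
        +   // body, and ELSE/ENDIF rearrange this tree later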
  4002. +   region_node *r = sh->create_region();
  4003. +   depart_node *d_true = sh->create_depart(r);
  4004. +   depart_node *d_false = sh->create_depart(r);
  4005. +   if_node *i = sh->create_if();
  4006. +
  4007. +   i->cond = sh->get_special_value(SV_EXEC_MASK);
  4008. +   r->push_back(d_false);
  4009. +   d_false->push_back(i);
  4010. +   i->push_back(d_true);
  4011. +   emit_node(r);
  4012. +   if_stack.push(r);
  4013. +   current = d_true;
  4014. +   return 0;
  4015. +}
  4016. +
  4017. +int tgsi_translator::ti_else() {
  4018. +   assert(!if_stack.empty());
  4019. +   region_node *r = if_stack.top();
  4020. +   depart_node *d_false = static_cast<depart_node*>(r->first);
  4021. +   assert(d_false && d_false->is_depart());
  4022. +   if_node *i = static_cast<if_node*>(d_false->first);
  4023. +   assert(i && i->is_if());
  4024. +   assert(!i->next);
  4025. +   container_node *c = sh->create_container();
  4026. +
  4027. +   i->insert_after(c);
  4028. +   current = c;
  4029. +   return 0;
  4030. +}
  4031. +
  4032. +int tgsi_translator::ti_endif() {
  4033. +   assert(!if_stack.empty());
  4034. +   region_node *r = if_stack.top();
  4035. +   depart_node *d = static_cast<depart_node*>(r->first);
  4036. +   assert(d && d->is_depart());
  4037. +   if_node *i = static_cast<if_node*>(d->first);
  4038. +   assert(i && i->is_if());
  4039. +   container_node *c = static_cast<container_node*>(i->next);
  4040. +   assert(!c || (current == c && c->is_container()));
  4041. +
  4042. +   if (c)
  4043. +       c->expand();
  4044. +
  4045. +   current = r->parent;
  4046. +   if_stack.pop();
  4047. +   return 0;
  4048. +}
  4049. +
  4050. +int tgsi_translator::ti_begin_loop() {
  4051. +   region_node *r = sh->create_region();
  4052. +   repeat_node *d = sh->create_repeat(r);
  4053. +
  4054. +   r->push_back(d);
  4055. +   emit_node(r);
  4056. +   loop_stack.push(r);
  4057. +   current = d;
  4058. +   return 0;
  4059. +}
  4060. +
  4061. +int tgsi_translator::ti_loop_op() {
  4062. +   assert(!loop_stack.empty());
  4063. +   region_node *r = loop_stack.top();
  4064. +   container_node *rd;
  4065. +
  4066. +   if (info->isa_op == CF_OP_LOOP_CONTINUE)
  4067. +       rd = sh->create_repeat(r);
  4068. +   else
  4069. +       rd = sh->create_depart(r);
  4070. +   if (!current->empty())
  4071. +       rd->move(current->begin(), current->end());
  4072. +   emit_node(rd);
  4073. +   sh->simplify_dep_rep(rd);
  4074. +   return 0;
  4075. +}
  4076. +
  4077. +int tgsi_translator::ti_end_loop() {
  4078. +   assert(!loop_stack.empty());
  4079. +   region_node *r = loop_stack.top();
  4080. +
  4081. +   current = r->parent;
  4082. +   loop_stack.pop();
  4083. +   return 0;
  4084. +}
  4085. +
  4086. +int tgsi_translator::split_src_arg(tgsi_arg &ta) {
  4087. +   int k;
  4088. +   vvec t;
  4089. +   create_temps(t, 4);
  4090. +
  4091. +   for (k = 0; k < 4; ++k) {
  4092. +       emit_alu(ALU_OP1_MOV, t[k], 0, asrc(get_arg_value(ta, k)));
  4093. +   }
  4094. +   ta.rel = 0;
  4095. +   ta.values = t;
  4096. +   ta.kind = VLK_TEMP;
  4097. +   return 0;
  4098. +}
  4099. +
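        +// relative constant addressing: compute rel_addr + sel into a temp and
        +// VFETCH the vec4 from the constant buffer (kc_bank) into temps with the
        +// requested swizzle, then use those temps as the argument values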
  4100. +int tgsi_translator::fetch_rel_const(tgsi_arg& ta) {
  4101. +   int i;
  4102. +   value* t = create_temp();
  4103. +   value* addr = get_tgsi_value(VLK_TGSI_ADDR, ta.rel_addr_index, 0);
  4104. +   emit_alu(ALU_OP2_ADD_INT, t, 0, asrc(addr), asrc((unsigned) ta.sel));
  4105. +
  4106. +   vvec r;
  4107. +   create_temps(r, 4);
  4108. +
  4109. +   fetch_node *f = create_fetch(FETCH_OP_VFETCH);
  4110. +   f->bc.resource_id = ta.kc_bank;
  4111. +   f->bc.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
  4112. +   f->bc.mega_fetch_count = 16;
  4113. +   f->bc.data_format = 0x23; // FMT_32_32_32_32_FLOAT;
  4114. +   f->bc.num_format_all = 2; /* NUM_FORMAT_SCALED */
  4115. +   f->bc.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
  4116. +   f->bc.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
  4117. +   f->bc.endian_swap = r600_endian_swap(32);
  4118. +   f->src[0] = t;
  4119. +   f->dst = r;
  4120. +
  4121. +   for (i = 0; i < 4; ++i) {
  4122. +       f->bc.dst_sel[i] = ta.swz[i];
  4123. +   }
  4124. +
  4125. +   emit_node(f);
  4126. +   ta.values = r;
  4127. +   ta.rel = 0;
  4128. +   ta.kind = VLK_TEMP;
  4129. +   return 0;
  4130. +}
  4131. +
  4132. +} // namespace r600_sb
  4133. diff --git a/src/gallium/drivers/r600/sb/sb_tgsi.h b/src/gallium/drivers/r600/sb/sb_tgsi.h
  4134. new file mode 100644
  4135. index 0000000..bbfb115
  4136. --- /dev/null
  4137. +++ b/src/gallium/drivers/r600/sb/sb_tgsi.h
  4138. @@ -0,0 +1,331 @@
  4139. +/*
  4140. + * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
  4141. + *
  4142. + * Permission is hereby granted, free of charge, to any person obtaining a
  4143. + * copy of this software and associated documentation files (the "Software"),
  4144. + * to deal in the Software without restriction, including without limitation
  4145. + * on the rights to use, copy, modify, merge, publish, distribute, sub
  4146. + * license, and/or sell copies of the Software, and to permit persons to whom
  4147. + * the Software is furnished to do so, subject to the following conditions:
  4148. + *
  4149. + * The above copyright notice and this permission notice (including the next
  4150. + * paragraph) shall be included in all copies or substantial portions of the
  4151. + * Software.
  4152. + *
  4153. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  4154. + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  4155. + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  4156. + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  4157. + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  4158. + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  4159. + * USE OR OTHER DEALINGS IN THE SOFTWARE.
  4160. + *
  4161. + * Authors:
  4162. + *      Vadim Girlin
  4163. + */
  4164. +
  4165. +
  4166. +#ifndef SB_TGSI_TRANSLATOR_H_
  4167. +#define SB_TGSI_TRANSLATOR_H_
  4168. +
  4169. +extern "C" {
  4170. +#include "tgsi/tgsi_info.h"
  4171. +#include "tgsi/tgsi_parse.h"
  4172. +#include "tgsi/tgsi_scan.h"
  4173. +}
  4174. +
  4175. +namespace r600_sb {
  4176. +
  4177. +struct alu_src {
  4178. +   value *v;
  4179. +   int abs;
  4180. +   int neg;
  4181. +
  4182. +   alu_src(value *v = NULL, int abs = 0, int neg = 0)
  4183. +       : v(v), abs(abs), neg(neg) {}
  4184. +};
  4185. +
  4186. +class tgsi_translator {
  4187. +
  4188. +   static const int max_io = 40;
  4189. +
  4190. +   struct shader_io {
  4191. +       r600_shader_io d;
  4192. +
  4193. +       unsigned tgsi_index;
  4194. +       unsigned use_mask;
  4195. +   };
  4196. +
  4197. +   struct tgsi_arg {
  4198. +       int file;
  4199. +       value_kind kind;
  4200. +       int sel;
  4201. +       int rel;
  4202. +       int abs;
  4203. +       int neg;
  4204. +       int rel_addr_index;
  4205. +       int rel_addr_chan;
  4206. +       int rel_array_id;
  4207. +       int kc_bank;
  4208. +       unsigned swz[4];
  4209. +       vvec values;
  4210. +       bool dst;
  4211. +   };
  4212. +
  4213. +   struct tgsi_args {
  4214. +       tgsi_arg dst;
  4215. +       tgsi_arg src[TGSI_FULL_MAX_SRC_REGISTERS];
  4216. +       int nsrc;
  4217. +   };
  4218. +
  4219. +/* struct cf_stack_elem {
  4220. +       region_node *r;
  4221. +   };
  4222. +*/
  4223. +   typedef int (tgsi_translator::*tgsi_inst_func)();
  4224. +   enum tgsi_inst_flags {
  4225. +       TIF_ALU_SWAPSRC01 = (1 << 0)
  4226. +
  4227. +   };
  4228. +
  4229. +   struct tgsi_inst_info {
  4230. +       const char *name;
  4231. +       unsigned tgsi_op;
  4232. +       int isa_op;
  4233. +       tgsi_inst_func func;
  4234. +       unsigned flags;
  4235. +   };
  4236. +
  4237. +   static const tgsi_inst_info tgsi_info_table[TGSI_OPCODE_LAST];
  4238. +   static alu_src null_alu_src;
  4239. +
  4240. +   typedef int *emit_func();
  4241. +
  4242. +   sb_context &ctx;
  4243. +   shader *sh;
  4244. +   unsigned shader_id;
  4245. +
  4246. +   r600_pipe_shader *ps;
  4247. +   r600_shader_key key;
  4248. +
  4249. +   shader_io input[max_io];
  4250. +   shader_io output[max_io];
  4251. +
  4252. +   int ninput;
  4253. +   int noutput;
  4254. +
  4255. +   tgsi_token* tokens;
  4256. +   tgsi_parse_context parse;
  4257. +   tgsi_full_instruction *inst;
  4258. +   const tgsi_inst_info *info;
  4259. +
  4260. +   int face_input;
  4261. +   int fragcoord_input;
  4262. +   int colors_used;
  4263. +   int clip_vertex_write;
  4264. +   int cv_output;
  4265. +   int nr_ps_max_color_exports;
  4266. +   int nr_ps_color_exports;
  4267. +   int vs_out_misc_write;
  4268. +   int vs_out_point_size;
  4269. +   int uses_tex_buffers;
  4270. +   int has_txq_cube_array_z_comp;
  4271. +
  4272. +   // XXX probably unused now
  4273. +   unsigned indirect_vlk;
  4274. +
  4275. +   int instanceid_index;
  4276. +   int vertexid_index;
  4277. +
  4278. +   boolean two_side;
  4279. +
  4280. +   unsigned clip_dist_write;
  4281. +   unsigned fs_write_all;
  4282. +   unsigned uses_kill;
  4283. +   unsigned tgsi_proc;
  4284. +   unsigned interp_mask;
  4285. +   unsigned file_offset[TGSI_FILE_COUNT];
  4286. +
  4287. +   container_node *current;
  4288. +   std::stack<region_node*> if_stack;
  4289. +   std::stack<region_node*> loop_stack;
  4290. +
  4291. +   unsigned write_mask;
  4292. +   int clamp;
  4293. +
  4294. +   tgsi_args args;
  4295. +
  4296. +   std::vector<uint32_t> literals;
  4297. +
  4298. +public:
  4299. +
  4300. +   tgsi_translator(sb_context &ctx, r600_pipe_shader *ps, r600_shader_key key,
  4301. +                   unsigned shader_id)
  4302. +       : ctx(ctx), sh(), shader_id(shader_id), ps(ps), key(key),
  4303. +         input(), output(), ninput(), noutput(),
  4304. +         tokens(), parse(),
  4305. +         face_input(-1), fragcoord_input(-1), colors_used(),
  4306. +         clip_vertex_write(), cv_output(), nr_ps_max_color_exports(),
  4307. +         nr_ps_color_exports(),
  4308. +         vs_out_misc_write(), vs_out_point_size(), uses_tex_buffers(),
  4309. +         has_txq_cube_array_z_comp(), indirect_vlk(),
  4310. +         instanceid_index(-1), vertexid_index(-1), two_side(),
  4311. +         clip_dist_write(), fs_write_all(), uses_kill(),
  4312. +         tgsi_proc(), interp_mask(),
  4313. +         file_offset(), current(),
  4314. +         if_stack(), loop_stack(), write_mask(), clamp(), args() {}
  4315. +
  4316. +   shader* translate();
  4317. +
  4318. +private:
  4319. +
  4320. +   int spi_sid(int name, int sid);
  4321. +
  4322. +   int parse_declarations();
  4323. +   int parse_instructions();
  4324. +
  4325. +   int parse_property();
  4326. +   int parse_declaration();
  4327. +   int parse_immediate();
  4328. +   int parse_instruction();
  4329. +
  4330. +   int split_src_arg(tgsi_arg &ta);
  4331. +
  4332. +   int emit_inputs();
  4333. +   int get_ij(shader_io &in);
  4334. +   alu_packed_node* build_interp(shader_io& in, unsigned type);
  4335. +   alu_group_node* build_interp_flat(shader_io& in);
  4336. +
  4337. +   int emit_exports();
  4338. +   int emit_fake_export(unsigned type);
  4339. +   int emit_export(shader_io &o, unsigned type, unsigned base, unsigned *swz,
  4340. +                   unsigned tgsi_index);
  4341. +
  4342. +   int update_pipe_shader();
  4343. +
  4344. +   value* get_arg_value(unsigned index, unsigned chan);
  4345. +   value* get_arg_value(tgsi_arg &ta, unsigned chan);
  4346. +
  4347. +   value* get_tgsi_value(value_kind kind, unsigned index, unsigned chan);
  4348. +
  4349. +   value* tgsi_dst(unsigned chan) { return get_arg_value(0, chan); }
  4350. +   value_kind file_to_value_kind(unsigned file);
  4351. +
  4352. +   alu_node* create_alu(unsigned op);
  4353. +
  4354. +   alu_node* build_alu(unsigned op, int chan, int dstchan = -1);
  4355. +
  4356. +   alu_node* build_alu(unsigned op, value *dst, int clamp,
  4357. +                       value *s0, int s0abs, int s0neg,
  4358. +                       value *s1, int s1abs, int s1neg,
  4359. +                       value *s2, int s2abs, int s2neg);
  4360. +
  4361. +   alu_node* build_alu(unsigned op, value *dst, int clamp,
  4362. +                       const alu_src& s0) {
  4363. +       return build_alu(op, dst, clamp, s0.v, s0.abs, s0.neg, NULL, 0, 0,
  4364. +                        NULL, 0, 0);
  4365. +   }
  4366. +   alu_node* build_alu(unsigned op, value *dst, int clamp,
  4367. +                       const alu_src& s0, const alu_src& s1) {
  4368. +       return build_alu(op, dst, clamp, s0.v, s0.abs, s0.neg,
  4369. +                        s1.v, s1.abs, s1.neg, NULL, 0, 0);
  4370. +   }
  4371. +   alu_node* build_alu(unsigned op, value *dst, int clamp,
  4372. +                       const alu_src& s0, const alu_src& s1,
  4373. +                       const alu_src& s2) {
  4374. +       return build_alu(op, dst, clamp, s0.v, s0.abs, s0.neg,
  4375. +                        s1.v, s1.abs, s1.neg, s2.v, s2.abs, s2.neg);
  4376. +   }
  4377. +
  4378. +   int emit_alu(unsigned op, int chan, int dstchan = -1);
  4379. +
  4380. +   int emit_alu(unsigned op, value *dst, int clamp,
  4381. +                       value *s0, int s0abs, int s0neg,
  4382. +                       value *s1, int s1abs, int s1neg,
  4383. +                       value *s2, int s2abs, int s2neg);
  4384. +
  4385. +   int emit_alu(unsigned op, value *dst, int clamp,
  4386. +                       const alu_src& s0) {
  4387. +       return emit_alu(op, dst, clamp, s0.v, s0.abs, s0.neg, NULL, 0, 0,
  4388. +                        NULL, 0, 0);
  4389. +   }
  4390. +   int emit_alu(unsigned op, value *dst, int clamp,
  4391. +                       const alu_src& s0, const alu_src& s1) {
  4392. +       return emit_alu(op, dst, clamp, s0.v, s0.abs, s0.neg,
  4393. +                        s1.v, s1.abs, s1.neg, NULL, 0, 0);
  4394. +   }
  4395. +   int emit_alu(unsigned op, value *dst, int clamp,
  4396. +                       const alu_src& s0, const alu_src& s1,
  4397. +                       const alu_src& s2) {
  4398. +       return emit_alu(op, dst, clamp, s0.v, s0.abs, s0.neg,
  4399. +                        s1.v, s1.abs, s1.neg, s2.v, s2.abs, s2.neg);
  4400. +   }
  4401. +
  4402. +   void emit_node(node *n);
  4403. +   void begin_group();
  4404. +   void end_group();
  4405. +   vvec get_vector_values(value_kind kind, unsigned tgsi_index,
  4406. +                          unsigned *swz = NULL);
  4407. +
  4408. +   int ti_unsupported();
  4409. +   int ti_alu();
  4410. +   int ti_dot();
  4411. +   int ti_repl();
  4412. +   int ti_lit();
  4413. +   int ti_trig();
  4414. +   int ti_scs();
  4415. +   value* prepare_trig(alu_src s);
  4416. +
  4417. +   int ti_exp();
  4418. +   int ti_log();
  4419. +   int ti_dst();
  4420. +   int ti_lrp();
  4421. +   int ti_pow();
  4422. +   int ti_replicate(value *t);
  4423. +   int ti_xpd();
  4424. +   int ti_kill();
  4425. +   int ti_arl();
  4426. +   int ti_ssg();
  4427. +   int ti_cmp();
  4428. +   int ti_umad();
  4429. +   int ti_f2iu();
  4430. +   int ti_ineg();
  4431. +   int ti_iabs();
  4432. +   int ti_divmod();
  4433. +
  4434. +   int ti_tex();
  4435. +   int ti_buffer_txq();
  4436. +   int ti_vtx_fetch();
  4437. +
  4438. +   int ti_if();
  4439. +   int ti_else();
  4440. +   int ti_endif();
  4441. +
  4442. +   int ti_begin_loop();
  4443. +   int ti_loop_op();
  4444. +   int ti_end_loop();
  4445. +
  4446. +   alu_src asrc(value *v, int abs = 0, int neg = 0);
  4447. +   alu_src asrc(literal l, int abs = 0, int neg = 0);
  4448. +   alu_src asrc(float f, int abs = 0, int neg = 0);
  4449. +   alu_src asrc(uint32_t u, int abs = 0, int neg = 0);
  4450. +   alu_src asrc(tgsi_arg& ta, int chan);
  4451. +   alu_src asrc(tgsi_arg& ta, int chan, int abs, int neg);
  4452. +
  4453. +   value* create_temp(int chan = 0) { return sh->create_temp_value(chan); }
  4454. +   void create_temps(vvec &temps, int n) {
  4455. +       temps.resize(n);
  4456. +       for (int i = 0; i < n; ++i)
  4457. +           temps[i] = create_temp();
  4458. +   }
  4459. +
  4460. +   uint32_t get_immediate(sel_chan sc);
  4461. +
  4462. +   fetch_node* create_fetch(unsigned op);
  4463. +
  4464. +   int fetch_rel_const(tgsi_arg& ta);
  4465. +};
  4466. +
  4467. +} // namespace r600_sb
  4468. +
  4469. +#endif /* SB_TGSI_TRANSLATOR_H_ */
  4470. diff --git a/src/gallium/drivers/r600/sb/sb_valtable.cpp b/src/gallium/drivers/r600/sb/sb_valtable.cpp
  4471. index 00aee66..ad2e78b 100644
  4472. --- a/src/gallium/drivers/r600/sb/sb_valtable.cpp
  4473. +++ b/src/gallium/drivers/r600/sb/sb_valtable.cpp
  4474. @@ -61,13 +61,55 @@ sb_ostream& operator << (sb_ostream &o, value &v) {
  4475.     }
  4476.  
  4477.     case VLK_REG:
  4478. -       o << "R" << v.select.sel() << "."
  4479. +       if (v.rel) {
  4480. +           o << "AREG" << v.select;
  4481. +           o << "[";
  4482. +           o << *v.rel;
  4483. +           o << "]";
  4484. +           o << "_" << v.uid;
  4485. +       } else
  4486. +           o << "R" << v.select.sel() << "."
  4487. +               << chans[v.select.chan()];
  4488. +       break;
  4489. +   case VLK_TGSI_INPUT:
  4490. +       if (v.rel) {
  4491. +           o << "AIN" << v.select;
  4492. +           o << "[";
  4493. +           o << *v.rel;
  4494. +           o << "]";
  4495. +           o << "_" << v.uid;
  4496. +       } else
  4497. +           o << "IN" << v.select.sel() << "."
  4498. +               << chans[v.select.chan()];
  4499. +       break;
  4500. +   case VLK_TGSI_OUTPUT:
  4501. +       if (v.rel) {
  4502. +           o << "AOUT" << v.select;
  4503. +           o << "[";
  4504. +           o << *v.rel;
  4505. +           o << "]";
  4506. +           o << "_" << v.uid;
  4507. +       } else
  4508. +           o << "OUT" << v.select.sel() << "."
  4509. +               << chans[v.select.chan()];
  4510. +       break;
  4511. +   case VLK_TGSI_TEMP:
  4512. +       if (v.rel) {
  4513. +           o << "ATEMP" << v.select;
  4514. +           o << "[";
  4515. +           o << *v.rel;
  4516. +           o << "]";
  4517. +           o << "_" << v.uid;
  4518. +       } else
  4519. +           o << "TEMP" << v.select.sel() << "."
  4520. +               << chans[v.select.chan()];
  4521. +       break;
  4522. +   case VLK_TGSI_ADDR:
  4523. +       o << "ADDR" << v.select.sel() << "."
  4524.             << chans[v.select.chan()];
  4525. -
  4526.         break;
  4527. -   case VLK_KCACHE: {
  4528. +   case VLK_KCACHE:
  4529.         o << "C" << v.select.sel() << "." << chans[v.select.chan()];
  4530. -   }
  4531.         break;
  4532.     case VLK_CONST:
  4533.         o << v.literal_value.f << "|";
  4534. @@ -80,16 +122,6 @@ sb_ostream& operator << (sb_ostream &o, value &v) {
  4535.     case VLK_TEMP:
  4536.         o << "t" << v.select.sel() - shader::temp_regid_offset;
  4537.         break;
  4538. -   case VLK_REL_REG:
  4539. -
  4540. -       o << "A" << v.select;
  4541. -       o << "[";
  4542. -       o << *v.rel;
  4543. -       o << "]";
  4544. -
  4545. -       o << "_" << v.uid;
  4546. -
  4547. -       break;
  4548.     case VLK_UNDEF:
  4549.         o << "undef";
  4550.         break;
  4551. @@ -113,7 +145,7 @@ sb_ostream& operator << (sb_ostream &o, value &v) {
  4552.  
  4553.     sel_chan g;
  4554.  
  4555. -   if (v.is_rel()) {
  4556. +   if (v.array) {
  4557.         g = v.array->gpr;
  4558.     } else {
  4559.         g = v.gpr;
  4560. @@ -542,7 +574,8 @@ bool ra_constraint::check() {
  4561.     return true;
  4562.  }
  4563.  
  4564. -bool gpr_array::is_dead() {
  4565. +bool rel_array::is_dead() {
  4566. +   // XXX maybe do something here?
  4567.     return false;
  4568.  }