zmatt

pru near vs far.cc

Dec 21st, 2021 (edited)
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 5.09 KB | None | 0 0
  1. #include <stdint.h>
  2.  
  3. typedef uint32_t uint32_t;  // NOTE(review): redeclares uint32_t as itself, so it adds nothing beyond <stdint.h> -- presumably left over from renaming a shorter alias; confirm and drop
  4.  
  5. // near = within first 64KB of memory
  6. // far  = not within first 64KB of memory, placed in "far" sections
  7. // If not explicitly specified, the default is given via --mem_model:data (which defaults to near).
  8. //
  9. // __attribute__(( cregister("foo", near) )) = within 256 bytes of cregister "foo"
  10. // __attribute__(( cregister("foo", far) ))  = within 64KB of cregister "foo"
  11. // If neither "near" nor "far" is specified, the default is "near".
  12.  
  13. far uint32_t far0[] = { 0 };  // far, no cregister
  14. far uint32_t far1[] __attribute__(( cregister("SHARED", far) )) = { 0 };  // far, declared within 64KB of cregister "SHARED"
  15. far uint32_t far2[] __attribute__(( cregister("SHARED", near) )) = { 0 };  // far, declared within 256 bytes of cregister "SHARED"
  16. near uint32_t near0[] = { 0 };  // near, no cregister
  17. near uint32_t near1[] __attribute__(( cregister("LOCAL", far) )) = { 0 };  // near, declared within 64KB of cregister "LOCAL"
  18. near uint32_t near2[] __attribute__(( cregister("LOCAL", near) )) = { 0 };  // near, declared within 256 bytes of cregister "LOCAL"
      // NOTE(review): each array above is defined with a single element ({ 0 }), so the [1] / [i] / [j]
      // accesses further down can index past the end; they only make sense as code-generation probes.
  19.  
  20. // cregister has no impact when clpru is forced to produce a pointer:
  21.  
  22. uint32_t *far0_ptr_0() {  return &far0[0];  }    // ldi32  (2 cycles) -- address of element 0; far0: far, no cregister
  23. uint32_t *far1_ptr_0() {  return &far1[0];  }    // ldi32  (2 cycles) -- address of element 0; far1: far, cregister("SHARED", far)
  24. uint32_t *far2_ptr_0() {  return &far2[0];  }    // ldi32  (2 cycles) -- address of element 0; far2: far, cregister("SHARED", near)
  25. uint32_t *near0_ptr_0() {  return &near0[0];  }  // ldi    (1 cycle) -- address of element 0; near0: near, no cregister
  26. uint32_t *near1_ptr_0() {  return &near1[0];  }  // ldi    (1 cycle) -- address of element 0; near1: near, cregister("LOCAL", far)
  27. uint32_t *near2_ptr_0() {  return &near2[0];  }  // ldi    (1 cycle) -- address of element 0; near2: near, cregister("LOCAL", near)
  28.  
  29. uint32_t *far0_ptr_i( uint32_t i ) {  return &far0[i];  }    // ldi32 lsl add  (4 cycles) -- address of element i; far0: far, no cregister
  30. uint32_t *far1_ptr_i( uint32_t i ) {  return &far1[i];  }    // ldi32 lsl add  (4 cycles) -- address of element i; far1: far, cregister("SHARED", far)
  31. uint32_t *far2_ptr_i( uint32_t i ) {  return &far2[i];  }    // ldi32 lsl add  (4 cycles) -- address of element i; far2: far, cregister("SHARED", near)
  32. uint32_t *near0_ptr_i( uint32_t i ) {  return &near0[i];  }  // ldi lsl add    (3 cycles) -- address of element i; near0: near, no cregister
  33. uint32_t *near1_ptr_i( uint32_t i ) {  return &near1[i];  }  // ldi lsl add    (3 cycles) -- address of element i; near1: near, cregister("LOCAL", far)
  34. uint32_t *near2_ptr_i( uint32_t i ) {  return &near2[i];  }  // ldi lsl add    (3 cycles) -- address of element i; near2: near, cregister("LOCAL", near)
  35.  
  36. // but it can be used to optimize direct access, especially cregister(near):
  37. // (note that when creg is used (lbco), it no longer matters whether the variable is in near or far memory)
  38.  
  39. uint32_t far0_get_0() {  return far0[0];  }    // ldi32 lbbo  (5 cycles) -- reads element 0; far0: far, no cregister
  40. uint32_t far1_get_0() {  return far1[0];  }    // ldi lbco    (4 cycles) -- reads element 0; far1: far, cregister("SHARED", far)
  41. uint32_t far2_get_0() {  return far2[0];  }    // lbco        (3 cycles) -- reads element 0; far2: far, cregister("SHARED", near)
  42. uint32_t near0_get_0() {  return near0[0];  }  // ldi lbbo    (4 cycles) -- reads element 0; near0: near, no cregister
  43. uint32_t near1_get_0() {  return near1[0];  }  // ldi lbco    (4 cycles) -- reads element 0; near1: near, cregister("LOCAL", far)
  44. uint32_t near2_get_0() {  return near2[0];  }  // lbco        (3 cycles) -- reads element 0; near2: near, cregister("LOCAL", near)
  45.  
  46. // cregister(far) is of no use for optimizing variable-offset access:
  47.  
  48. uint32_t far0_get_i( uint32_t i ) {  return far0[i];  }    // lsl ldi32 lbbo  (6 cycles) -- reads element i; far0: far, no cregister
  49. uint32_t far1_get_i( uint32_t i ) {  return far1[i];  }    // lsl ldi32 lbbo  (6 cycles) -- reads element i; far1: far, cregister("SHARED", far)
  50. uint32_t far2_get_i( uint32_t i ) {  return far2[i];  }    // lsl add lbco    (5 cycles) -- reads element i; far2: far, cregister("SHARED", near)
  51. uint32_t near0_get_i( uint32_t i ) {  return near0[i];  }  // lsl ldi lbbo    (5 cycles) -- reads element i; near0: near, no cregister
  52. uint32_t near1_get_i( uint32_t i ) {  return near1[i];  }  // lsl ldi lbbo    (5 cycles) -- reads element i; near1: near, cregister("LOCAL", far)
  53. uint32_t near2_get_i( uint32_t i ) {  return near2[i];  }  // lsl add lbco    (5 cycles) -- reads element i; near2: near, cregister("LOCAL", near)
  54.  
  55. // cregister(far) is also not worth exploiting when multiple accesses are done:
  56.  
  57. uint32_t far0_add_0_1() {  return far0[0] + far0[1];  }     // ldi32 lbbo lbbo add  (9 cycles) -- sums elements 0 and 1 (NOTE: far0 is defined with a single element); far0: far, no cregister
  58. uint32_t far1_add_0_1() {  return far1[0] + far1[1];  }     // ldi32 lbbo lbbo add  (9 cycles) -- sums elements 0 and 1 (NOTE: far1 is defined with a single element); far1: far, cregister("SHARED", far)
  59. uint32_t far2_add_0_1() {  return far2[0] + far2[1];  }     // lbco lbco add        (7 cycles) -- sums elements 0 and 1 (NOTE: far2 is defined with a single element); far2: far, cregister("SHARED", near)
  60. uint32_t near0_add_0_1() {  return near0[0] + near0[1];  }  // ldi lbbo lbbo add    (8 cycles) -- sums elements 0 and 1 (NOTE: near0 is defined with a single element); near0: near, no cregister
  61. uint32_t near1_add_0_1() {  return near1[0] + near1[1];  }  // ldi lbbo lbbo add    (8 cycles) -- sums elements 0 and 1 (NOTE: near1 is defined with a single element); near1: near, cregister("LOCAL", far)
  62. uint32_t near2_add_0_1() {  return near2[0] + near2[1];  }  // lbco lbco add        (7 cycles) -- sums elements 0 and 1 (NOTE: near2 is defined with a single element); near2: near, cregister("LOCAL", near)
  63.  
  64. // with enough indexing complexity, even cregister(near) no longer helps:
  65.  
  66. uint32_t far0_add_i_j( uint32_t i, uint32_t j ) {  return far0[i] + far0[j];  }     // ldi32 lsl lsl lbbo lbbo add  (11 cycles) -- sums elements i and j; far0: far, no cregister
  67. uint32_t far1_add_i_j( uint32_t i, uint32_t j ) {  return far1[i] + far1[j];  }     // ldi32 lsl lsl lbbo lbbo add  (11 cycles) -- sums elements i and j; far1: far, cregister("SHARED", far)
  68. uint32_t far2_add_i_j( uint32_t i, uint32_t j ) {  return far2[i] + far2[j];  }     // ldi32 lsl lsl lbbo lbbo add  (11 cycles) -- sums elements i and j; far2: far, cregister("SHARED", near)
  69. uint32_t near0_add_i_j( uint32_t i, uint32_t j ) {  return near0[i] + near0[j];  }  // ldi lsl lsl lbbo lbbo add    (10 cycles) -- sums elements i and j; near0: near, no cregister
  70. uint32_t near1_add_i_j( uint32_t i, uint32_t j ) {  return near1[i] + near1[j];  }  // ldi lsl lsl lbbo lbbo add    (10 cycles) -- sums elements i and j; near1: near, cregister("LOCAL", far)
  71. uint32_t near2_add_i_j( uint32_t i, uint32_t j ) {  return near2[i] + near2[j];  }  // ldi lsl lsl lbbo lbbo add    (10 cycles) -- sums elements i and j; near2: near, cregister("LOCAL", near)
  72.  
  73. // cycle-count summary:
  74. //      far0    far1    far2    near0   near1   near2
  75. //      2       2       2       1       1       1       ptr_0()
  76. //      4       4       4       3       3       3       ptr_i()
  77. //      5       4       3       4       4       3       get_0()
  78. //      6       6       5       5       5       5       get_i()
  79. //      9       9       7       8       8       7       add_0_1()
  80. //      11      11      11      10      10      10      add_i_j()
  81. //
  82. // (note: these don't include the function's return instruction (1 cycle))
Advertisement
Add Comment
Please, Sign In to add comment