Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include <stdint.h>
- typedef uint32_t uint32_t; // NOTE(review): redeclares the <stdint.h> name as itself, which adds nothing and is only accepted as a same-type typedef redefinition (C11); possibly a leftover of a shorter alias - confirm before removing
- // near = within first 64KB of memory
- // far = not within first 64KB of memory, placed in "far" sections
- // If not explicitly specified, the default is given via --mem_model:data (which defaults to near).
- //
- // __attribute__(( cregister("foo", near) )) = within 256 bytes of cregister "foo"
- // __attribute__(( cregister("foo", far) )) = within 64KB of cregister "foo"
- // If neither "near" nor "far" is specified, the default is "near".
- far uint32_t far0[] = { 0 }; // one-element array in far memory, no cregister hint (baseline for the far cases)
- far uint32_t far1[] __attribute__(( cregister("SHARED", far) )) = { 0 }; // far memory, declared to lie within 64KB of cregister "SHARED"
- far uint32_t far2[] __attribute__(( cregister("SHARED", near) )) = { 0 }; // far memory, declared to lie within 256 bytes of cregister "SHARED"
- near uint32_t near0[] = { 0 }; // one-element array in near memory, no cregister hint (baseline for the near cases)
- near uint32_t near1[] __attribute__(( cregister("LOCAL", far) )) = { 0 }; // near memory, declared to lie within 64KB of cregister "LOCAL"
- near uint32_t near2[] __attribute__(( cregister("LOCAL", near) )) = { 0 }; // near memory, declared to lie within 256 bytes of cregister "LOCAL"; NOTE(review): all six arrays hold a single element, so any index other than 0 is out of bounds if the accessors below are ever executed rather than just compiled
- // cregister has no impact when clpru is forced to produce a pointer:
- // each function below takes no arguments and returns the address of element 0 of its array.
- uint32_t *far0_ptr_0(void) { return &far0[0]; } // ldi32 (2 cycles)
- uint32_t *far1_ptr_0(void) { return &far1[0]; } // ldi32 (2 cycles)
- uint32_t *far2_ptr_0(void) { return &far2[0]; } // ldi32 (2 cycles)
- uint32_t *near0_ptr_0(void) { return &near0[0]; } // ldi (1 cycle)
- uint32_t *near1_ptr_0(void) { return &near1[0]; } // ldi (1 cycle)
- uint32_t *near2_ptr_0(void) { return &near2[0]; } // ldi (1 cycle)
- uint32_t *far0_ptr_i( uint32_t i ) { return &far0[i]; } // returns the address of element i of far0; ldi32 lsl add (4 cycles)
- uint32_t *far1_ptr_i( uint32_t i ) { return &far1[i]; } // same for far1 (cregister far): ldi32 lsl add (4 cycles)
- uint32_t *far2_ptr_i( uint32_t i ) { return &far2[i]; } // same for far2 (cregister near): ldi32 lsl add (4 cycles)
- uint32_t *near0_ptr_i( uint32_t i ) { return &near0[i]; } // returns the address of element i of near0; ldi lsl add (3 cycles)
- uint32_t *near1_ptr_i( uint32_t i ) { return &near1[i]; } // same for near1 (cregister far): ldi lsl add (3 cycles)
- uint32_t *near2_ptr_i( uint32_t i ) { return &near2[i]; } // same for near2 (cregister near): ldi lsl add (3 cycles)
- // but it can be used to optimize direct access, especially cregister(near):
- // (note that when creg is used (lbco), it no longer matters if the variable is in near or far memory)
- // each function below takes no arguments and returns the value of element 0 of its array.
- uint32_t far0_get_0(void) { return far0[0]; } // ldi32 lbbo (5 cycles)
- uint32_t far1_get_0(void) { return far1[0]; } // ldi lbco (4 cycles)
- uint32_t far2_get_0(void) { return far2[0]; } // lbco (3 cycles)
- uint32_t near0_get_0(void) { return near0[0]; } // ldi lbbo (4 cycles)
- uint32_t near1_get_0(void) { return near1[0]; } // ldi lbco (4 cycles)
- uint32_t near2_get_0(void) { return near2[0]; } // lbco (3 cycles)
- // cregister(far) is of no use for optimizing variable-offset access (far1 compiles like far0, near1 like near0); each function returns the value of element i:
- uint32_t far0_get_i( uint32_t i ) { return far0[i]; } // lsl ldi32 lbbo (6 cycles)
- uint32_t far1_get_i( uint32_t i ) { return far1[i]; } // lsl ldi32 lbbo (6 cycles)
- uint32_t far2_get_i( uint32_t i ) { return far2[i]; } // lsl add lbco (5 cycles)
- uint32_t near0_get_i( uint32_t i ) { return near0[i]; } // lsl ldi lbbo (5 cycles)
- uint32_t near1_get_i( uint32_t i ) { return near1[i]; } // lsl ldi lbbo (5 cycles)
- uint32_t near2_get_i( uint32_t i ) { return near2[i]; } // lsl add lbco (5 cycles)
- // cregister(far) is also not worth exploiting when multiple accesses are done:
- // each function below takes no arguments and returns the sum of elements 0 and 1 of its array.
- // NOTE(review): the arrays are defined in this file with a single element, so index 1 is past the end; these exist to inspect generated code only.
- uint32_t far0_add_0_1(void) { return far0[0] + far0[1]; } // ldi32 lbbo lbbo add (9 cycles)
- uint32_t far1_add_0_1(void) { return far1[0] + far1[1]; } // ldi32 lbbo lbbo add (9 cycles)
- uint32_t far2_add_0_1(void) { return far2[0] + far2[1]; } // lbco lbco add (7 cycles)
- uint32_t near0_add_0_1(void) { return near0[0] + near0[1]; } // ldi lbbo lbbo add (8 cycles)
- uint32_t near1_add_0_1(void) { return near1[0] + near1[1]; } // ldi lbbo lbbo add (8 cycles)
- uint32_t near2_add_0_1(void) { return near2[0] + near2[1]; } // lbco lbco add (7 cycles)
- // with enough indexing complexity, even cregister(near) no longer helps (all far variants compile alike, as do all near variants); each function returns the sum of elements i and j:
- uint32_t far0_add_i_j( uint32_t i, uint32_t j ) { return far0[i] + far0[j]; } // ldi32 lsl lsl lbbo lbbo add (11 cycles)
- uint32_t far1_add_i_j( uint32_t i, uint32_t j ) { return far1[i] + far1[j]; } // ldi32 lsl lsl lbbo lbbo add (11 cycles)
- uint32_t far2_add_i_j( uint32_t i, uint32_t j ) { return far2[i] + far2[j]; } // ldi32 lsl lsl lbbo lbbo add (11 cycles)
- uint32_t near0_add_i_j( uint32_t i, uint32_t j ) { return near0[i] + near0[j]; } // ldi lsl lsl lbbo lbbo add (10 cycles)
- uint32_t near1_add_i_j( uint32_t i, uint32_t j ) { return near1[i] + near1[j]; } // ldi lsl lsl lbbo lbbo add (10 cycles)
- uint32_t near2_add_i_j( uint32_t i, uint32_t j ) { return near2[i] + near2[j]; } // ldi lsl lsl lbbo lbbo add (10 cycles)
- // cycle-count summary:
- // far0  far1  far2  near0  near1  near2
- //    2     2     2      1      1      1  ptr_0()
- //    4     4     4      3      3      3  ptr_i()
- //    5     4     3      4      4      3  get_0()
- //    6     6     5      5      5      5  get_i()
- //    9     9     7      8      8      7  add_0_1()
- //   11    11    11     10     10     10  add_i_j()
- //
- // (note: these don't include the function's return instruction (1 cycle))
Advertisement
Add Comment
Please, Sign In to add comment