Advertisement
Not a member of Pastebin yet?
Sign up —
it unlocks many cool features!
- ******************************************************************************
- *
- * void D_DrawSpans16 (espan_t *pspan)
- *
- * standard scan drawing function (16 pixel subdivision)
- *
- ******************************************************************************
- cnop 0,4
- _D_DrawSpans16
- rs.l 1
- .pspan rs.l 1
- move.l .pspan(sp),a0
- @D_DrawSpans16
- ***** stackframe
- rsreset
- .saved4 rs.l 1
- .saved5 rs.l 1
- .savea6 rs.l 1
- .szstpu rs.s 1
- .szstpv rs.s 1
- .szorg rs.s 1
- .tzstpu rs.s 1
- .tzstpv rs.s 1
- .tzorg rs.s 1
- .zistpu rs.s 1
- .zistpv rs.s 1
- .ziorg rs.s 1
- .fpuregs rs.x 6
- .intregs rs.l 11
- rs.l 1
- .pspan rs.l 1
- ****** Prologue. Global variables are put into registers or onto the stackframe
- movem.l d2-d7/a2-a6,-(sp)
- fmovem.x fp2-fp7,-(sp)
- move.l _bbextentt,a2
- move.l _tadjust,a3
- move.l _bbextents,a4
- move.l _sadjust,a5
- move.l _d_ziorigin,-(sp)
- move.l _d_zistepv,-(sp)
- move.l _d_zistepu,-(sp)
- move.l _d_tdivzorigin,-(sp)
- move.l _d_tdivzstepv,-(sp)
- move.l _d_tdivzstepu,-(sp)
- move.l _d_sdivzorigin,-(sp)
- move.l _d_sdivzstepv,-(sp)
- move.l _d_sdivzstepu,-(sp)
- sub.l #.szstpu,sp
- ****** First loop. In every iteration one complete span is drawn
- * pbase = (unsigned char *)cacheblock;
- *
- * sdivz16stepu = d_sdivzstepu * 16;
- * tdivz16stepu = d_tdivzstepu * 16;
- * zi16stepu = d_zistepu * 16;
- *
- * do
- * {
- * pdest = (unsigned char *)((byte *)d_viewbuffer +
- * (screenwidth * pspan->v) + pspan->u);
- *
- * count = pspan->count;
- *
- * // calculate the initial s/z, t/z, 1/z, s, and t and clamp
- * du = (float)pspan->u;
- * dv = (float)pspan->v;
- *
- * sdivz = d_sdivzorigin + dv*d_sdivzstepv + du*d_sdivzstepu;
- * tdivz = d_tdivzorigin + dv*d_tdivzstepv + du*d_tdivzstepu;
- * zi = d_ziorigin + dv*d_zistepv + du*d_zistepu;
- * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
- *
- move.l _cacheblock,a1 ;pbase = (unsigned char *)cacheblock
- fmove.s #16,fp7
- fmove.s .szstpu(sp),fp3
- fmul fp7,fp3 ;sdivz16stepu = d_sdivzstepu * 16
- fmove.s .tzstpu(sp),fp4
- fmul fp7,fp4 ;tdivz16stepu = d_tdivzstepu * 16
- fmove.s .zistpu(sp),fp5
- fmul fp7,fp5 ;zi16stepu = d_zistepu * 16
- move.l a0,a6 ;get function parameter
- .loop
- move.l a6,.savea6(sp) ;save actual ptr to pspan
- move.l _d_viewbuffer,a0
- move.l _screenwidth,d0
- move.l (a6)+,d1
- fmove.l d1,fp2 ;du = (float)pspan->u
- move.l (a6)+,d2
- fmove.l d2,fp7 ;dv = (float)pspan->v
- move.l (a6)+,d4
- muls d2,d0 ;d0 = screenwidth * pspan->v
- add.l d1,d0
- add.l d0,a0 ;pdest = d_viewbuffer + pspan->u + d0
- lea .szstpu(sp),a6 ;a6 -> stackframe
- fmove.s (a6)+,fp0
- fmul fp2,fp0 ;fp0 = du * d_sdivzstepu
- fmove.s (a6)+,fp1
- fmul fp7,fp1 ;fp1 = dv * d_sdivzstepv
- fadd fp1,fp0
- fadd.s (a6)+,fp0 ;sdivz = d_sdivzorigin + fp0 + fp1
- fmove.s (a6)+,fp1
- fmul fp2,fp1 ;fp1 = du * d_tdivzstepu
- fmove.s (a6)+,fp6
- fmul fp7,fp6 ;fp6 = dv * d_tdivzstepv
- fadd fp6,fp1
- fadd.s (a6)+,fp1 ;tdivz = d_tdivzorigin + fp1 + fp6
- fmul.s (a6)+,fp2 ;fp2 = du * d_zistepu
- fmul.s (a6)+,fp7 ;fp7 = dv * d_zistepv
- fadd fp7,fp2
- fadd.s (a6)+,fp2 ;zi = d_ziorigin + fp2 + fp7
- fmove.s #65536,fp6
- fdiv fp2,fp6 ;z = (float)0x10000 / zi
- * s = (int)(sdivz * z) + sadjust;
- * if (s > bbextents)
- * s = bbextents;
- * else if (s < 0)
- * s = 0;
- *
- * t = (int)(tdivz * z) + tadjust;
- * if (t > bbextentt)
- * t = bbextentt;
- * else if (t < 0)
- * t = 0;
- fmove fp6,fp7
- fmul fp0,fp7 ;fp7 = sdivz * z
- fmove.l fp7,d6 ;convert to integer
- add.l a5,d6 ;s = d6 + sadjust
- cmp.l a4,d6 ;if (s > bbextents)
- bgt.b .down
- tst.l d6 ;if (s < 0)
- bge.b .keep
- .up
- moveq #0,d6 ;s = 0
- bra.b .keep
- .down
- move.l a4,d6 ;s = bbextents
- .keep
- fmul fp1,fp6 ;fp6 = tdivz * z
- fmove.l fp6,d7 ;convert to integer
- add.l a3,d7 ;t = d7 + tadjust
- cmp.l a2,d7 ;if (t > bbextentt)
- bgt.b .down2
- tst.l d7 ;if (t < 0)
- bge.b .keep2
- .up2
- moveq #0,d7 ;t = 0
- bra.b .keep2
- .down2
- move.l a2,d7 ;t = bbextentt
- .keep2
- move.l d4,d1
- ****** Second loop. In every iteration one part of the whole span is drawn
- ****** d2 gets the value (spancount-1)! [NOT spancount]
- ****** d1 = count
- * do
- * {
- * // calculate s and t at the far end of the span
- * if (count >= 16)
- * spancount = 16;
- * else
- * spancount = count;
- *
- * count -= spancount;
- *
- * if (count)
- * {
- .loop2
- moveq #16-1,d2 ;spancount = 16
- cmp.l #16,d1 ;if (count >= 16)
- bgt.b .cont
- move.l d1,d2 ;spancount = count
- subq.l #1,d2
- moveq #0,d1 ;count -= spancount
- bra.w .finalpart
- .cont
- sub.l #16,d1 ;count -= spancount;
- ****** Evaluation of the values for the inner loop. This version is used for
- ****** span size = 16
- ****** a2 : bbextentt
- ****** a3 : tadjust
- ****** a4 : bbextents
- ****** a5 : sadjust
- ****** fp0 : sdivz
- ****** fp1 : tdivz
- ****** fp2 : zi
- ****** fp3 : sdivz16stepu
- ****** fp4 : tdivz16stepu
- ****** fp5 : zi16stepu
- * // calculate s/z, t/z, zi->fixed s and t at far end of span,
- * // calculate s and t steps across span by shifting
- * sdivz += sdivz16stepu;
- * tdivz += tdivz16stepu;
- * zi += zi16stepu;
- * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
- * snext = (int)(sdivz * z) + sadjust;
- * if (snext > bbextents)
- * snext = bbextents;
- * else if (snext < 16)
- * snext = 16; // prevent round-off error on <0 steps from
- * // from causing overstepping & running off the
- * // edge of the texture
- * tnext = (int)(tdivz * z) + tadjust;
- * if (tnext > bbextentt)
- * tnext = bbextentt;
- * else if (tnext < 16)
- * tnext = 16; // guard against round-off error on <0 steps
- * sstep = (snext - s) >> 4;
- * tstep = (tnext - t) >> 4;
- * }
- fadd fp3,fp0 ;sdivz += sdivz16stepu
- fadd fp4,fp1 ;tdivz += tdivz16stepu
- fadd fp5,fp2 ;zi += zi16stepu
- fmove.s #65536,fp7
- fdiv fp2,fp7 ;z = (float)0x10000 / zi;
- fmove fp7,fp6
- fmul fp0,fp6 ;fp2 = sdivz * z
- fmove.l fp6,d4 ;convert to integer
- add.l a5,d4 ;snext = d4 + sadjut
- cmp.l a4,d4 ;if (snext > bbextents)
- bgt.b .down3
- cmp.l #16,d4 ;if (snext < 16)
- bge.b .keep3
- .up3
- moveq #16,d4 ;snext = 16
- bra.b .keep3
- .down3
- move.l a4,d4 ;snext = bbextents
- .keep3
- fmul fp1,fp7 ;fp7 = tdivz * z
- fmove.l fp7,d5 ;convert to integer
- add.l a3,d5 ;tnext = d5 + tadjust
- cmp.l a2,d5 ;if (tnext > bbextentt)
- bgt.b .down4
- cmp.l #16,d5 ;if (tnext < 16)
- bge.b .keep4
- .up4
- moveq #16,d5 ;tnext = 16
- bra.b .keep4
- .down4
- move.l a2,d5 ;tnext = bbextentt
- .keep4
- move.l d4,.saved4(sp) ;save snext
- move.l d5,.saved5(sp) ;save tnext
- sub.l d6,d4 ;d4 = snext - s
- sub.l d7,d5 ;d5 = tnext - t
- asr.l #4,d4 ;sstep = d4 >> 4
- asr.l #4,d5 ;tstep = d5 >> 4
- bra.w .mainloop
- ****** Evaluation of the values for the inner loop. This version is used for
- ****** span size < 16
- ****** The original algorithm has two ugly divisions at the end of this part.
- ****** These are removed by the following optimization:
- ****** First, the divisors 1,2 and 4 are handled specially to gain speed. The
- ****** other divisors are handled using a reciprocal table.
- ****** a2 : bbextentt
- ****** a3 : tadjust
- ****** a4 : bbextents
- ****** a5 : sadjust
- ****** fp0 : sdivz
- ****** fp1 : tdivz
- ****** fp2 : zi
- * // calculate s/z, t/z, zi->fixed s and t at last pixel in span (so
- * // can't step off polygon), clamp, calculate s and t steps across
- * // span by division, biasing steps low so we don't run off the
- * // texture
- * spancountminus1 = (float)(spancount - 1);
- * sdivz += d_sdivzstepu * spancountminus1;
- * tdivz += d_tdivzstepu * spancountminus1;
- * zi += d_zistepu * spancountminus1;
- * z = (float)0x10000 / zi; // prescale to 16.16 fixed-point
- * snext = (int)(sdivz * z) + sadjust;
- * if (snext > bbextents)
- * snext = bbextents;
- * else if (snext < 16)
- * snext = 16; // prevent round-off error on <0 steps from
- * // from causing overstepping & running off the
- * // edge of the texture
- *
- * tnext = (int)(tdivz * z) + tadjust;
- * if (tnext > bbextentt)
- * tnext = bbextentt;
- * else if (tnext < 16)
- * tnext = 16; // guard against round-off error on <0 steps
- *
- * if (spancount > 1)
- * {
- * sstep = (snext - s) / (spancount - 1);
- * tstep = (tnext - t) / (spancount - 1);
- * }
- * }
- .finalpart
- fmove.l d2,fp7 ;spancountminus1 = (float)(spancount-1)
- fmove fp7,fp6
- fmul.s .szstpu(sp),fp6 ;fp6 = d_sdivzstepu * spancountminus1
- fadd fp6,fp0 ;sdivz += fp6
- fmove fp7,fp6
- fmul.s .tzstpu(sp),fp6 ;fp6 = d_tdivzstepu * spancountminus1
- fadd fp6,fp1 ;tdivz += fp6
- fmul.s .zistpu(sp),fp7 ;fp7 = d_zistepu * spancountminus1
- fadd fp7,fp2 ;zi += fp7
- fmove.s #65536,fp7
- fdiv fp2,fp7 ;z = (float)0x10000 / zi;
- fmove fp7,fp6
- fmul fp0,fp6 ;fp6 = sdivz * z
- fmove.l fp6,d4 ;convert to integer
- add.l a5,d4 ;snext = d4 + sadjust
- cmp.l a4,d4 ;if (snext > bbextents)
- bgt.b .down5
- cmp.l #16,d4 ;if (snext < 16)
- bge.b .keep5
- .up5
- moveq #16,d4 ;snext = 16
- bra.b .keep5
- .down5
- move.l a4,d4 ;snext = bbextents
- .keep5
- fmul fp1,fp7 ;fp7 = tdivz * z
- fmove.l fp7,d5 ;convert to integer
- add.l a3,d5 ;tnext = d5 + tadjust
- cmp.l a2,d5 ;if (tnext > bbextentt)
- bgt.b .down6
- cmp.l #16,d5 ;if (tnext < 16)
- bge.b .keep6
- .up6
- moveq #16,d5 ;tnext = 16
- bra.b .keep6
- .down6
- move.l a2,d5 ;tnext = bbextentt
- .keep6
- move.l d4,.saved4(sp) ;save snext
- move.l d5,.saved5(sp) ;save tnext
- sub.l d6,d4 ;d4 = snext - s
- sub.l d7,d5 ;d5 = tnext - t
- IFEQ QDIV
- tst.l d2
- beq.w .mainloop
- divs.l d2,d4
- divs.l d2,d5
- ELSEIF
- cmp #5,d2 ;(spancount-1) < 5?
- blt.b .special ;yes -> special case
- cmp #8,d2
- beq.b .spec_8
- .qdiv
- IFNE NICE_DIV
- lsl.l #2,d4
- lsl.l #2,d5
- lea ReciprocTable,a6
- move 0(a6,d2.w*2),d0
- move.l d4,d3
- mulu d0,d3
- clr d3
- swap d3
- swap d4
- muls d0,d4
- add.l d3,d4
- move.l d5,d3
- mulu d0,d3
- clr d3
- swap d3
- swap d5
- muls d0,d5
- add.l d3,d5
- bra.b .mainloop
- ELSEIF
- asr.l #7,d4 ;d4 >> 7
- asr.l #7,d5 ;d5 >> 7
- lea ReciprocTable,a6 ;a6 -> reciprocal table
- move 0(a6,d2.w*2),d0 ;d0 = (1/(spancount-1))<<16
- muls d0,d4 ;d4 = d4 / (spancount-1)
- asr.l #7,d4 ;sstep = d4 >> 7
- muls d0,d5 ;d5 = d5 / (spancount-1)
- asr.l #7,d5 ;tstep = d5 >> 7
- bra.b .mainloop
- ENDC
- .special
- cmp #1,d2 ;switch (spancount-1)
- ble.b .mainloop ;0,1 -> no scaling needed
- cmp #3,d2 ;3 -> standard qdiv
- beq.b .qdiv
- blt.b .spec_2
- asr.l #2,d4 ;4 -> scale by shifting right
- asr.l #2,d5
- bra.b .mainloop
- .spec_8
- asr.l #3,d4 ;8 -> scale by shifting right
- asr.l #3,d5
- bra.b .mainloop
- .spec_2
- asr.l #1,d4 ;2 -> scale by shifting right
- asr.l #1,d5
- ENDC
- ****** Main drawing loop. Here lies the speed.
- ****** Very optimized (removed multiplication from inner loop)
- ****** d2 : spancount
- ****** d4 : sstep
- ****** d5 : tstep
- ****** d6 : s
- ****** d7 : t
- ****** a0 : pdest
- ****** a1 : pbase
- * do
- * {
- * *pdest++ = *(pbase + (s >> 16) + (t >> 16) * cachewidth);
- * s += sstep;
- * t += tstep;
- * } while (--spancount > 0);
- .mainloop
- move.l d1,-(sp)
- lea .PixTable,a6 ;a6 -> Functable
- move.l _cachewidth,d3 ;read cachewidth
- move.l 0(a6,d2.w*4),a6 ;get pointer to function
- swap d7
- swap d4
- move.l d7,d1
- swap d5
- muls d3,d7 ;d7 = t integer part * cachewidth
- move d5,d2
- clr d1 ;d1 = t fractional part
- muls d3,d2 ;tstep integer part * cachewidth
- move d4,d0 ;d0 = sstep integer part
- clr d5 ;d5 = tstep fractional part
- clr d4 ;d4 = sstep fractional part
- swap d6 ;d6 = s swapped
- jmp (a6)
- .Pix16
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6 ;increment s fractional part
- addx.w d0,d6 ;increment s integer part
- add.l d2,d7 ;increment t integer part
- add.l d5,d1 ;increment t fractional part
- bcc.b .Pix15 ;check if carry
- add.l d3,d7 ;add cachewidth to t
- .Pix15
- lea 0(a1,d6.w),a6 ;and so long...
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix14
- add.l d3,d7
- .Pix14
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix13
- add.l d3,d7
- .Pix13
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix12
- add.l d3,d7
- .Pix12
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix11
- add.l d3,d7
- .Pix11
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix10
- add.l d3,d7
- .Pix10
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix9
- add.l d3,d7
- .Pix9
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix8
- add.l d3,d7
- .Pix8
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix7
- add.l d3,d7
- .Pix7
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix6
- add.l d3,d7
- .Pix6
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix5
- add.l d3,d7
- .Pix5
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix4
- add.l d3,d7
- .Pix4
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix3
- add.l d3,d7
- .Pix3
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix2
- add.l d3,d7
- .Pix2
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix1
- add.l d3,d7
- .Pix1
- lea 0(a1,d6.w),a6
- move.b 0(a6,d7.l),(a0)+
- add.l d4,d6
- addx.w d0,d6
- add.l d2,d7
- add.l d5,d1
- bcc.b .Pix0
- add.l d3,d7
- .Pix0
- ****** loop terminations
- move.l .saved5+4(sp),d7 ;t = tnext
- move.l .saved4+4(sp),d6 ;s = snext
- move.l (sp)+,d1
- * tst.l d1 ;while (count > 0)
- bgt.w .loop2
- move.l .savea6(sp),a6 ;while ((pspan = pspan->next) != NULL)
- move.l SPAN_PNEXT(a6),a6
- tst.l a6
- bne.w .loop
- add.l #.fpuregs,sp
- fmovem.x (sp)+,fp2-fp7
- movem.l (sp)+,d2-d7/a2-a6
- rts
- .PixTable
- dc.l .Pix1
- dc.l .Pix2
- dc.l .Pix3
- dc.l .Pix4
- dc.l .Pix5
- dc.l .Pix6
- dc.l .Pix7
- dc.l .Pix8
- dc.l .Pix9
- dc.l .Pix10
- dc.l .Pix11
- dc.l .Pix12
- dc.l .Pix13
- dc.l .Pix14
- dc.l .Pix15
- dc.l .Pix16
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement