[TUHS] A Reiser tour de force

Rob Pike robpike at gmail.com
Sun Apr 3 22:26:07 AEST 2022


Ah, yes, that's the one.

-rob

On Sun, Apr 3, 2022 at 9:03 PM Paul Ruizendaal via TUHS
<tuhs at minnie.tuhs.org> wrote:
>
> A not-very-thorough search at tuhs turned up V9/jerq/src/lib/j/bitblt.c
> It appears to be a pre-Reiser bitblt, not what was asked for.
>
>
> The Reiser code is in the V8 jerq tarball that Dan Cross donated:
> v8jerq.tar.bz2
>
> It is in file blit/src/libj/bitblt.s (attached below for convenience). It is 750 lines of 68K assembler. It does not appear to have been ported to the Bellmac 32 CPU. Maybe it did not make sense in that context.
>
> Paul
>
> =====
>
> #
> #  bitblt(sm,r,dm,p,fc)
> #  Bitmap *sm,*dm;
> #  Rectangle r;
> #  Point p;
> #  int fc;
> #
> #  by John F. Reiser  summer 1982
> #
> #  Depending on the case at hand, generate very good code and execute it.
> #
>
> # offsets in a Point
> set x,0
> set y,2
> # offsets in a Rectangle
> set origin,0
> set corner,4
> # offsets in a Bitmap
> set base,0
> set width,4
> set rect,6
> # parameter offsets from %fp
> set sm,8
> set r,12
> set dm,20
> set p,24
> set fc,28
>
> set NREG,11
>
> global bitblt
> bitblt:
> movm.l &0x3f3e,-(%sp) # save C registers
> movm.l NREG*4-4+sm(%sp),&0x001f
> # d1=r.o.x,,r.o.y; d2=r.c.x,,r.c.y; d4=p.x,,p.y;
> mov.l %d0,%a4 # sm
> mov.l %d3,%a5 # dm
> mov.w NREG*4-4+fc(%sp),%a6 # a6.w == fc
> movm.l rect(%a4),&0x9 # d0=sm.o.x,,sm.o.y; d3=sm.c.x,,sm.c.y;
> movm.l rect(%a5),&0x60 # d5=dm.o.x,,dm.o.y; d6=dm.c.x,,dm.c.y;
>
> lea.l $L50(%pc),%a0
> L5:
> # clip r.y to sm.y
> mov.w %d0,%d7 # sm.o.y
> sub.w %d1,%d7 # - r.o.y
> ble.b L10
> mov.w %d0,%d1 # r.o.y = sm.o.y; /* r.o.y was above sm.rect */
> add.w %d7,%d4 # p.y parallels r.o.y
> L10:
> cmp.w %d2,%d3 # r.c.y : sm.c.y
> ble.b L20
> mov.w %d3,%d2 # r.c.y = sm.c.y; /* bottom of r was below sm.rect */
> L20:
> # clip (r.y at p.y) to dm.y
> mov.w %d5,%d7 # dm.o.y
> sub.w %d4,%d7 # -p.y
> ble.b L30
> mov.w %d5,%d4 # p.y = dm.o.y; /* p.y was above dm.rect */
> add.w %d7,%d1 # r.o.y parallels p.y
> L30:
> mov.w %d1,%d7 # r.o.y
> add.w %d6,%d7 # + dm.c.y
> sub.w %d4,%d7 # - p.y  /* == max y that dm.rect allows in r */
> cmp.w %d2,%d7 # r.c.y : limit
> ble.b L40
> mov.w %d7,%d2 # r.c.y = limit
> L40:
> mov.w %d2,%d7 # r.c.y
> sub.w %d1,%d7 # - r.o.y
> sub.w &1,%d7 # /* == h-1  in bits */
> blt.b ret
> jmp (%a0)
>
> retgen:
> lea.l gensiz(%sp),%sp
> ret8:
> add.l &8,%sp
> ret:
> movm.l (%sp)+,&0x7cfc
> rts
>
> L50:
> # mirror in pi/4 and reuse same code to clip x
> swap.w %d0; swap.w %d1; swap.w %d2; swap.w %d3
> swap.w %d4; swap.w %d5; swap.w %d6; swap.w %d7
> lea.l $L55(%pc),%a0
> br.b L5
>
> L55:
> mov.l %d1,%a1
> mov.l %d4,%d6
> #
> #  So far
> # %d7 == h-1,,w-1
> # %d6 == p.y,,p.x
> # %a6.w == fc
> # %a5 == dm
> # %a4 == sm
> # %a1 == r.o.y,,r.o.x
> #
> #  Compute masks, and width in words
> #
> mov.w %d6,%d0 # p.x  /* left endpoint of dst */
> mov.w %d7,%d1 # w-1
> add.w %d6,%d1 # right endpoint
>
> mov.l &-1,%d3
> mov.l &15,%d2
> and.w %d0,%d2
> lsr.w %d2,%d3 # mask1
> mov.l &-1,%d5
> mov.l &15,%d2
> and.w %d1,%d2
> add.w &1,%d2
> lsr.w %d2,%d5
> not.w %d5 # mask2
> swap.w %d5
> mov.w %d3,%d5 # mask2,,mask1
>
> asr.w &4,%d0
> asr.w &4,%d1
> sub.w %d0,%d1
> sub.w &1,%d1 # inner-loop width in words
>
> mov.l &0,%d4 # assume LtoR
> mov.w width(%a5),%d3
> add.w %d3,%d3
> mov.w width(%a4),%d2
> add.w %d2,%d2
> #
> #  So far
> # %d7 == h-1,,w-1  in bits
> # %d6 == p.y,,p.x
> # %d5 == mask2,,mask1
> # %d4 == 0  (LtoR)
> # %d3.w == dm width in bytes
> # %d2.w == sm width in bytes
> # %d1.w == inner-loop width in words
> # %a6.w == fc
> # %a5 == dm
> # %a4 == sm
> # %a1 == r.o.y,,r.o.x
> #
> #  If necessary, compensate for overlap of source and destination
> #
> cmp.l %a4,%a5
> bne.b L80 # overlap not possible
> mov.l %d6,%d0 # p.y,,p.x
> mov.w %a1,%d0 # p.y,,r.o.x
> cmp.l %a1,%d0 # r.o.y : p.y
> bge.b L60 # if (r.o.y < p.y)
> mov.l %d7,%d0 # h-1,,w-1
> clr.w %d0 # h-1,,0
> add.l %d0,%a1 # r.o.y += h-1;
> add.l %d0,%d6 # p.y += h-1;
> neg.w %d3 # wdst = -wdst;
> neg.w %d2 # wsrc = -wsrc;
> L60:
> cmp.w %d7,&16
> blt.b L70 # l<->r swap not needed for narrow
> cmp.w %d6,%a1 # p.x : r.o.x
> ble.b L70 # if (r.o.x < p.x)
> mov.l %a1,%d0
> add.w %d7,%d0
> mov.l %d0,%a1 # r.o.x += w-1;
> add.w %d7,%d6 # p.x += w-1;
> mov.l &-1,%d4 # RtoL
> swap.w %d5 # masks in other order
> L70:
> L80:
> #
> #  Locate actual starting points
> #
> mov.l %d6,%d0 # p.y,,p.x
> swap.w %d0
> mov.l %d0,-(%sp) # p
> mov.l %a5,-(%sp) # dm
>
> mov.l &15,%d0
> lea.l $L82(%pc),%a0 # assume narrow
> cmp.w %d7,%d0 # w-1 : 15
> ble.b L81 # guessed correctly
> lea.l $L85(%pc),%a0 # wide
> L81:
> mov.l %a0,-(%sp) # on return, go directly to wide/narrow code
> add.w %a6,%a6; add.w %a6,%a6 # with 4*fc
>
> mov.w %d1,%d7 # h-1 in bits,,inner width in words
> and.l %d0,%d6 # 0,,bit offset of p.x
> mov.l %a1,%d1 # r.o.y,,r.o.x
> and.w %d1,%d0 # bit offset of r.o.x
> sub.w %d0,%d6 # BO(p.x) - BO(r.o.x) /* amount of right rotation */
> swap.w %d1 # r.o.x,,r.o.y
> mov.l %d1,-(%sp) # r.o
> mov.l %a4,-(%sp) # sm
> lea.l addr,%a3
> jsr (%a3)
> mov.l %a0,%a2 # src = addr(sm,r.origin);
> add.l &8,%sp
> jmp (%a3) # %a0 = addr(dm,p);
> L82:
> mov.l &0,%d4
> mov.w %d5,%d4 # 0,,mask1
> swap.w %d5 # mask1,,mask2  (proper long mask; maybe 16 bits too wide)
> and.w %d5,%d4 # check for overlap of mask1 and mask2
> beq.b L83 # no overlap ==> %d5 already correct
> mov.l %d4,%d5 # overlap ==> reduce %d5 by 16 bits
> swap.w %d5 # and put it in the proper half
> L83:
> swap.w %d7 # ,,height-1
> lea.l $nrwtab(%pc,%a6.w),%a6 # -> optab
> tst.w %d6 # amount of right rotation
> bge.b L84
> neg.w %d6
> add.l &2,%a6
> L84:
> add.w (%a6),%a6
> jmp (%a6)
>
> nrwtab:
> short opMnwr-nrwtab- 0, opMnwl-nrwtab- 2
> short opSnwr-nrwtab- 4, opSnwl-nrwtab- 6
> short opCnwr-nrwtab- 8, opCnwl-nrwtab-10
> short opXnwr-nrwtab-12, opXnwl-nrwtab-14
>
> opMnwr:
> mov.l (%a2),%d0
> mov.l (%a0),%d1
> ror.l %d6,%d0
> eor.l %d1,%d0
> and.l %d5,%d0
> eor.l %d1,%d0
> mov.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opMnwr
> br ret8
>
> opMnwl:
> mov.l (%a2),%d0
> mov.l (%a0),%d1
> rol.l %d6,%d0
> eor.l %d1,%d0
> and.l %d5,%d0
> eor.l %d1,%d0
> mov.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opMnwl
> br ret8
>
> opSnwr:
> mov.l (%a2),%d0
> ror.l %d6,%d0
> and.l %d5,%d0
> or.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opSnwr
> br ret8
>
> opSnwl:
> mov.l (%a2),%d0
> rol.l %d6,%d0
> and.l %d5,%d0
> or.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opSnwl
> br ret8
>
> opCnwr:
> mov.l (%a2),%d0
> ror.l %d6,%d0
> and.l %d5,%d0
> not.l %d0
> and.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opCnwr
> br ret8
>
> opCnwl:
> mov.l (%a2),%d0
> rol.l %d6,%d0
> and.l %d5,%d0
> not.l %d0
> and.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opCnwl
> br ret8
>
> opXnwr:
> mov.l (%a2),%d0
> ror.l %d6,%d0
> and.l %d5,%d0
> eor.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opXnwr
> br ret8
>
> opXnwl:
> mov.l (%a2),%d0
> rol.l %d6,%d0
> and.l %d5,%d0
> eor.l %d0,(%a0)
> add.w %d2,%a2
> add.w %d3,%a0
> dbr %d7,opXnwl
> br ret8
>
> set DBR,0x51c8
> set MOVLI,0x2000+074 # mov.l &...,
> set MOVWI,0x3000+074 # mov.w &...,
> set ADDWI,0x0640 # add.w &...,
>
> set FDFRAG,16 # first destination is a fragment
> set LDFRAG,17 # last destination is a fragment
> set NSHF1,18
> set FD2D,19 # first destination should store 2 words
> set LD2D,20 # last destination should store 2 words
> set FSTORE,21
> set DST1L,24 # dst inner count is 0
> set SRC1L,25 # Nsrc is 2
>
> set gensiz,80
>
> widtab:
> mov.w %d0,(%a0)+; short 0
> or.w %d0,(%a0)+; short 0
> and.w %d0,(%a0)+; not.w %d0
> eor.w %d0,(%a0)+; short 0
>
> #
> #  So far
> # %d7 == h-1 (bits),,w (words)
> # %d6 == 0,,rotate count
> # %d5 == mask2,,mask1
> # %d4 == -RtoL
> # %d3.w == wdst (bytes)
> # %d2.w == wsrc (bytes)
> # %a6.w == 4*fc
> # %a2 -> src
> # %a0 -> dst
> #
> L85:
> lea.l $widtab(%pc,%a6.w),%a6
> tst.w %d4; bpl.b L300; bset &31,%d6
> L300:
> mov.w %d7,%d0 # inner word count
> bne.b L304; bset &DST1L,%d6
> L304:
> add.w &1,%d0 # Nsrc = 1+Ninner
> mov.w %d0,%a1 #   + ...
> add.w &1,%d0 # Ndst = 1+Ninner+1
> add.w %d0,%d0 # magnitude of dst addressing side effects
> tst.l %d6; bpl.b L310
> neg.w %d0; add.l &2,%a0 # RtoL
> L310:
> sub.w %d0,%d3 # compensate dst for autoincrement
>
> mov.w %d5,%d4 # mask1
> swap.w %d5 # mask2
>
> cmp.w %d4,&-1;            beq.b L320; bset &FDFRAG,%d6
> L320:
>
> cmp.w %d5,&-1; seq.b %d1; beq.b L330; bset &LDFRAG,%d6
> L330:
>
> tst.w %d6; bne.b L360 # not NOSHIFT
> add.w &1,%a1 # Nsrc = 1+Ninner+1
> mov.l %d6,%d0; swap.w %d0; ext.w %d0 # 0,,flag bits
> asr.w &1,%d7; roxl.w &1,%d0 # account for inner words odd
> mov.b $nstab(%pc,%d0.w),%d0
> bpl.b L340; add.w &1,%d7
> L340:
> add.b %d0,%d0
> bpl.b L350; sub.w &1,%d7
> L350:
> swap.w %d0; eor.l %d0,%d6 # the bits
> btst &DST1L,%d6; bne.b L355
> btst &FD2D,%d6; beq.b L410
> L355:
> ext.l %d4; bmi.b L410; swap.w %d4; not.w %d4 # NOSHIFT mask1 .l
> br.b L410 # NOSHIFT mask2 .l
> nstab:
> byte 0x82,0x80,0x04,0x80 # 0x80: +1 inner;  0x40: -1 inner
> byte 0x02,0x00,0x44,0x00 # 0x04: FD2D;      0x02: NSHF1 no first word
> L360:
> ext.w %d1; sub.w %d1,%d7 # extend inner loop
>
> mov.l &0xf,%d0 # 0  1     7  8  9     e  f
> add.w &8,%d6 # 8  9     f  0  1     6  7
> and.w %d0,%d6
> sub.w &8,%d6 # 0  1     7 -8 -7    -2 -1  X=C= sign
> mov.w %d6,%d1; bge.b L367 #                    X unchanged
> neg.w %d1   #             8  7     2  1  X=C= 1
> L367:
> roxl.w &1,%d1 # 0  2     e 11  f     5  3
> and.w %d0,%d1 # 0  2     e  1  f     5  3
> lsl.w &8,%d1 # magic position
> short ADDWI+001
>  ror.l &8,%d0
> mov.w %d1,%a3 # the rotate instruction
>
> mov.l &0,%d1; not.w %d1 # 0,,-1
> ror.l %d6,%d1 # where the bits are after a rotate
>
> mov.w %d1,%d0; and.w %d4,%d0; beq.b L370 # 1 src word covers dst frag
> not.w %d1;     and.w %d4,%d1; beq.b L370
> add.w &1,%a1; br.b L390 # fragment needs another src word
> L370:
> sub.w &1,%d7 # .l takes an inner word
> bset &FD2D,%d6
> ext.l %d4; bmi.b L390
> swap.w %d4; not.w %d4 # mask1 .l
> L390:
>
> swap.w %d1
>
> mov.w %d1,%d0; and.w %d5,%d0; beq.b L400 # 1 src word covers dst frag
> not.w %d1;     and.w %d5,%d1; beq.b L400
> add.w &1,%a1; br.b L420 # fragment needs another src word
> L400:
> dbr %d7,L405 # .l takes an inner word
> clr.w %d7; br.b L420 # nothing there to take
> L405:
> L410:
> bset &LD2D,%d6
> ext.l %d5; bmi.b L420
> swap.w %d5; not.w %d5 # mask2 .l
> L420:
>
> tst.w NREG*4-4+fc+8(%sp); bne.b L430; bset &FSTORE,%d6
> L430:
> mov.w %a1,%d0 # Nsrc
> add.w %d0,%d0 # magnitude of src addressing side effects
> tst.l %d6; bpl.b L431
> neg.w %d0; add.l &2,%a2 # RtoL
> L431:
> sub.w %d0,%d2 # compensate src for autoincrement
>
> lea.l -gensiz(%sp),%sp
> mov.l %sp,%a5
> swap.w %d3
> swap.w %d2
>
> cmp.w %a1,&2; bgt L445
> short MOVWI+00000
>  mov.l (%a2)+,%d0
> tst.l %d6; bpl.b L432; add.w &010,%d0 # RtoL
> L432:
> mov.w %d0,(%a5)+
> mov.l &0,%d1; mov.w &-0x1000,%d2; mov.w &0100,%d3
> lea.l $L438(%pc),%a1
> mov.l &-1,%d0 # prepare bits to decide on "swap"
> tst.w %d6; bpl.b L432d; neg.w %d6
> lsl.l %d6,%d0; br.b L432e
> L432d:
> lsr.l %d6,%d0
> L432e:
> btst &DST1L,%d6; beq.b L434
> bset &FD2D,%d6; bne.b L432a
> ext.l %d4; bmi.b L432a; swap.w %d4; not.w %d4 # mask1 .l
> L432a:
> bset &LD2D,%d6; bne.b L432b
> ext.l %d5; bmi.b L432b; swap.w %d5; not.w %d5 # mask2 .l
> L432b:
> and.l %d5,%d4; mov.l %d4,%d5 # single .l does it all
> add.l &1,%d4; beq L730 # all 32 bits
> sub.l &1,%d4 # need an "and"
> and.l %d5,%d0
> cmp.l %d5,%d0
> beq.b L432c
> short MOVWI+05300
>  swap.w %d0
> L432c:
> tst.w %d6; bne L690 # and a rotate
> br.b L437 # NOSHIFT
> L434:
> mov.w %a3,(%a5)+ # the rotate instr
> short MOVWI+05300
>  mov.l %d0,%d1 # copy after rotate
> and.l %d4,%d0
> cmp.l %d4,%d0
> seq.b %d0; neg.b %d0; ext.w %d0
> short ADDWI+000
>  swap.w %d0
> mov.w %d0,(%a5)+
> lea.l $L436(%pc),%a1
> br.b L437
> L436:
>  and.w %d4,%d0
> mov.w &01001,%d1; clr.w %d2; clr.w %d3
> lea.l $L438(%pc),%a1
> L437:
> br L700
> L438:
>  and.w %d5,%d0
> br L545
> L445:
> #
> #  During compilation
> # %d7 == h-1,,w
> # %d6 == flags,,rotate count
> # %d5 == mask2
> # %d4 == mask1
> # %d3 == dst_dW,,bits for xxx.[wl]
> # %d2 == src_dW,,bits for mov.[wl]
> # %d1.w == parity
> # %a6 -> optab
> # %a5 -> next generated instruction
> # %a4 -> top of inner loop
> # %a3.w == rotate instruction
> # %a2 -> src
> # %a1 -> fragment "and" instruction
> # %a0 -> dst
> #
> tst.w %d6; bne.b L480 # not NOSHIFT ==> always need first word
> btst &NSHF1,%d6; bne.b L485 # interplay of NOSHIFT, odd, FDFRAG
> L480:
> mov.l &1,%d1
> and.w %d7,%d1 # parity of inner word count
> lsl.w &2,%d1 # even ==> frag in %d0, odd ==> frag in %d1
> bsr genwid # generate for first word
>  and.w %d4,%d0
> L485:
> cmp.w %d7,&2; ble.b L490 # inner dbr always falls through
> btst &FSTORE,%d6; beq.b L490 # no conflict "mov field" vs. %d6
> short MOVWI+05300 # init inner count
>  mov.w %a4,%d6
> L490:
> mov.l %a5,%a4 # top of inner loop
> asr.w &1,%d7 # check inner word count
> blt.b L540 # single .l does it all
> bcc.b L500 # even
> beq.b L520 # 1
> short MOVWI+05300
>  br.b L500 # jump into middle of inner loop
> add.l &1,%a4 # remember to fixup "br.b"
> add.w &1,%d7 # middle entry ==> no dbr offset
> L500:
> beq.b L530 # no inner words at all
> mov.l &4,%d1 # use %d1 in
> bsr.b genwid # even half of inner loop
>  short 0
> L510:
> mov.w %a4,%d0; neg.w %d0
> bclr &0,%d0; beq.b L520
> add.w %a5,%d0; mov.b %d0,(%a4)+ # fixup "br.b" into middle
> L520:
> mov.l &0,%d1 # use %d0 in
> bsr.b genwid # odd half of inner loop
>  short 0
> sub.w &1,%d7 # offset for inner dbr loop
> ble.b L530 # dbr always falls through
> mov.w &DBR+6,(%a5)+
> sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
> L530:
>
> btst &LDFRAG,%d6; beq.b L540 # omit "and" for full last word
> mov.l &4,%d1
> bsr.b genwid
>  and.w %d5,%d0
> L540:
>
> tst.w %d7; ble.b L545 # no inner loop
> btst &FSTORE,%d6; bne.b L545 # possible conflict "mov field" vs. %d6
> short MOVWI+05300 # init inner count
>  mov.w %a4,%d6
> L545:
> swap.w %d3; tst.w %d3; beq.b L546 # wdst is full width of bitmap
> mov.w %d3,%a1 # dst_dW
> short MOVWI+05300
>  add.w %a1,%a0
> L546:
> swap.w %d2; tst.w %d2; beq.b L547 # wsrc is full width of bitmap
> mov.w %d2,%a3 # src_dW
> short MOVWI+05300
>  add.w %a3,%a2
> L547:
> mov.w &DBR+7,(%a5)+
> mov.l %sp,%a4 # top of outer loop
> cmp.b (%a4),&0x60; bne.b L548 # not br.b
> mov.b 1(%a4),%d0; ext.w %d0; lea.l 2(%a4,%d0.w),%a4 # collapse branches
> L548:
> sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
> short MOVWI+05300
>  jmp (%a5)
>
> mov.w %d7,%a4 # init inner count
> mov.w %d7,%d6 # init inner count, 2nd case
> swap.w %d7   # h-1
> lea.l $retgen(%pc),%a5
> jmp (%sp)
>
> genwid:
> mov.l (%sp)+,%a1 # -> inline parameter
> mov.l $genget(%pc,%d1.w),%d0
> tst.w %d1; beq.b L550; mov.w &01001,%d1; swap.w %d1 # parity bits
> L550:
> clr.w %d2; clr.w %d3 # .[wl] bits default to .w
> tst.l %d6; bpl.b L560; add.w &010,%d0 # RtoL
> L560:
> tst.w %d6; bne.b L569 # not NOSHIFT
> bclr &9,%d0 # NOSHIFT always %d0
> mov.w (%a1),%d1; bne.b L564 # not inner loop
> btst &FSTORE,%d6; beq.b L562 # not "mov"
> mov.l &070,%d1; and.w %d0,%d1
> lsl.w &3,%d1; or.w %d1,%d0 # copy RtoL mode
> add.w &-0x1000,%d0 # .w ==> .l
> mov.w %d0,(%a5)+
> L561:
> jmp 2(%a1)
> genget:
> swap.w %d0; mov.w (%a2)+,%d0
> swap.w %d1; mov.w (%a2)+,%d1
>
> L562:
> mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l
> add.w %d2,%d0
> L563:
> mov.l &0,%d1 # NOSHIFT always %d0
> br L698 # assemble the fetch, then do the op
> L564:
> lsr.w &1,%d1; bcs.b L562 # NOSHIFT always LD2D
> btst &FD2D,%d6; bne.b L562
> br.b L563 # alas, .w
> L569:
> mov.w (%a1),%d1; beq.b L630 # inner loop
> L570:
> lsr.w &1,%d1; bcs.b L580 # last word
> add.w &-0x1000,%d0 # force fetch .l
> mov.w %d0,(%a5)+ # the fetch .l
> short MOVLI+00000
>  mov.l %d0,%d1
>  swap.w %d0
> clr.w %d1; eor.l %d1,%d0 # parity for mov.l %d[01],%d[10]
> tst.l %d1; sne.b %d1; sub.b %d1,%d0 # parity for swap.w %d[01]
> mov.l %d0,(%a5) # ran out of registers
> mov.l &0x4c80ec,%d0 # microcoded bits
> tst.l %d6; bpl.b L572; ror.l &1,%d0 # RtoL
> L572:
> tst.w %d6; bpl.b L574; ror.l &2,%d0 # rol
> L574:
> btst &FD2D,%d6; beq.b L576; ror.l &4,%d0 # first op .l
> mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l corrections
> L576:
> ror.l &1,%d0; bpl.b L578 # "swap" not needed
> add.l &2,%a5
> ror.l &8,%d0; bpl.b L577 # existing "swap" parity OK
> eor.w &1,(%a5)
> L577:
> ror.l &8,%d0; bpl.b L578 # existing order OK
> sub.l &2,%a5
> mov.l (%a5),%d0; swap.w %d0; mov.l %d0,(%a5)
> add.l &2,%a5
> L578:
> add.l &2,%a5
> swap.w %d1 # junk,,parity
> br.b L690
> L580:
> btst &LD2D,%d6; beq.b L630 # operator .w
> mov.w &-0x1000,%d2 # mov.w +=> mov.l
> mov.w &0100,%d3 # xxx.w +=> xxx.l
> L630:
> tst.l %d6; smi.b %d1
> eor.b %d6,%d1; bpl.b L650 # rotation in same direction as scan
> swap.w %d0 # interchange "swap" and "mov"
> L650:
> mov.l %d0,(%a5)+
>
> swap.w %d1 # junk,,parity
> mov.w (%a1),%d0; lsr.w &1,%d0; bcs.b L660 # last word
> short MOVWI+000
>  mov.l %d0,%d1
> eor.w %d1,%d0
> mov.w %d0,(%a5)+
> br.b L690
> L660:
> tst.l %d6; bmi.b L690 # RtoL
> btst &LD2D,%d6; beq.b L690 # not .l
> tst.w %d6; bpl.b L670 # ror
> sub.l &2,%a5; br.b L690 # no "swap"
> L670:
> mov.w -4(%a5),(%a5)+ # extra "swap"
> L690:
> mov.w %a3,%d0
> eor.b %d1,%d0
> L698:
> mov.w %d0,(%a5)+ # the rotate instruction
> L700:
>
> mov.w (%a1),%d0; beq.b L730 # inner loop
> btst &0,%d0; bne.b L705 # last word
> btst &FDFRAG,%d6; beq.b L730 # no "and"
> L705:
> add.w %d3,%d0; add.w %d1,%d0; sub.b %d1,%d0 # and.[wl] %d[45],%d[01]
> btst &FSTORE,%d6; beq.b L720
> # "mov" partial word
> swap.w %d0 # save the "and"
> short MOVWI+00000 # ,%d0
>  mov.w (%a0),%d6
> add.w %d2,%d0 # mov.[wl]
> tst.l %d6; bpl.b L710; add.w &020,%d0 # RtoL; "(%a0)" ==> "-(%a0)"
> L710:
> mov.w %d0,(%a5)+ # instr to fetch memory part of word
> short MOVWI+00000 # ,%d0
>  eor.w %d6,%d0
> add.w %d3,%d0; add.b %d1,%d0 # eor.[wl] %d6,%d[01]
> swap.w %d0; mov.l %d0,(%a5)+; swap.w %d0; mov.w %d0,(%a5)+
> mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
> mov.l &-0100,%d1 # RtoL correction, if necessary
> br.b L770
> L720:
> mov.w %d0,(%a5)+ # "and" for non-mov operators
> L730:
> mov.w 2(%a6),%d0; beq.b L740 # not F_CLR
> add.w %d3,%d0; add.b %d1,%d0 # not.[wl] %d[01]
> mov.w %d0,(%a5)+
> L740:
> btst &FSTORE,%d6; beq.b L790 # non-"mov"
> mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
> mov.l &0100,%d1 # RtoL correction, if necessary
> L770:
> add.w (%a6),%d0
> tst.l %d6; bpl.b L780
> add.w %d1,%d0 # RtoL correction
> L780:
> mov.w %d0,(%a5)+
> jmp 2(%a1)
>
> L790:
> mov.w %d1,%d0; clr.b %d0; add.w %d3,%d0 # xxx.[wl] %d[01]
> mov.l &010,%d1 # RtoL correction, if necessary
> br.b L770
>
> #
> #  During execution
> # %d[01] == rotator
> # %d2 [reserved for texture bits]
> # %d3 [reserved for texture index]
> # %d4 == mask1
> # %d5 == mask2
> # %d6.w == inner count
> # %d7.w == outer count
> # %a0 -> dst
> # %a1 == dst_dW
> # %a2 -> src
> # %a3 == src_dW
> # %a4.w == inner count init
> # %a5 -> retgen
> # %a6 [reserved for -> texture]
> #
>


More information about the TUHS mailing list