It is in file blit/src/libj/bitblt.s (attached below for convenience). It is 750 lines of 68K assembler. It does not appear to have been ported to the Bellmac 32 CPU. Maybe it did not make sense in that context.
#
# bitblt(sm,r,dm,p,fc)
# Bitmap *sm,*dm;
# Rectangle r;
# Point p;
# int fc;
#
# by John F. Reiser summer 1982
#
# Depending on the case at hand, generate very good code and execute it.
#
# Structure layouts and stack-frame constants.
# A Point is 4 bytes (x at 0, y at 2); a Rectangle is two Points.
#
# offsets in a Point
set x,0
set y,2
# offsets in a Rectangle
set origin,0
set corner,4
# offsets in a Bitmap
# (width is read with mov.w and doubled to obtain a byte count, so it is
#  presumably a count of 16-bit words per scan line)
set base,0
set width,4
set rect,6
# parameter offsets from %fp
# (no frame pointer is established by bitblt; the arguments are addressed as
#  NREG*4-4+<offset>(%sp) once the registers have been pushed)
set sm,8
set r,12
set dm,20
set p,24
set fc,28
# number of registers pushed by the movm.l at entry (mask 0x3f3e has 11 bits set)
set NREG,11
#
# Entry: save registers, load the arguments, and clip the source rectangle r
# (together with the destination point p) against sm->rect and dm->rect.
# Each coordinate pair is kept in one long register as x,,y (x in the high
# word).  The clipping code at L5 works only on the low words, so it is run
# twice: first on y, then (after L50 swaps the halves of %d0-%d7) on x.
# %a0 holds the continuation address for the end of each pass.
# An empty result (height or width <= 0) returns immediately through ret.
#
global bitblt
bitblt:
movm.l &0x3f3e,-(%sp) # save C registers
# fetch the five argument longs into %d0-%d4: sm, r.origin, r.corner, dm, p
movm.l NREG*4-4+sm(%sp),&0x001f
# d1=r.o.x,,r.o.y; d2=r.c.x,,r.c.y; d4=p.x,,p.y;
mov.l %d0,%a4 # sm
mov.l %d3,%a5 # dm
mov.w NREG*4-4+fc(%sp),%a6 # a6.w == fc
movm.l rect(%a4),&0x9 # d0=sm.o.x,,sm.o.y; d3=sm.c.x,,sm.c.y;
movm.l rect(%a5),&0x60 # d5=dm.o.x,,dm.o.y; d6=dm.c.x,,dm.c.y;
lea.l $L50(%pc),%a0
# after the y pass, continue at L50
L5:
# clip r.y to sm.y
mov.w %d0,%d7 # sm.o.y
sub.w %d1,%d7 # - r.o.y
ble.b L10
mov.w %d0,%d1 # r.o.y = sm.o.y; /* r.o.y was above sm.rect */
add.w %d7,%d4 # p.y parallels r.o.y
L10:
cmp.w %d2,%d3 # r.c.y : sm.c.y
ble.b L20
mov.w %d3,%d2 # r.c.y = sm.c.y; /* bottom of r was below sm.rect */
L20:
# clip (r.y at p.y) to dm.y
mov.w %d5,%d7 # dm.o.y
sub.w %d4,%d7 # -p.y
ble.b L30
mov.w %d5,%d4 # p.y = dm.o.y; /* p.y was above dm.rect */
add.w %d7,%d1 # r.o.y parallels p.y
L30:
mov.w %d1,%d7 # r.o.y
add.w %d6,%d7 # + dm.c.y
sub.w %d4,%d7 # - p.y /* == max y that dm.rect allows in r */
cmp.w %d2,%d7 # r.c.y : limit
ble.b L40
mov.w %d7,%d2 # r.c.y = limit
L40:
# extent-1 of the clipped rectangle in this dimension; negative means empty
mov.w %d2,%d7 # r.c.y
sub.w %d1,%d7 # - r.o.y
sub.w &1,%d7 # /* == h-1 in bits */
blt.b ret
jmp (%a0)
#
# Common exits.
#   retgen: discard the gensiz-byte generated-code buffer, then fall into ret8
#   ret8:   discard the 8 bytes (dm, p) left on the stack from the addr calls
#   ret:    restore the registers saved at entry and return
#
retgen:
lea.l gensiz(%sp),%sp
ret8:
add.l &8,%sp
ret:
movm.l (%sp)+,&0x7cfc
rts
L50:
# mirror in pi/4 and reuse same code to clip x
# (h-1 in %d7 moves to the high word; the second pass leaves w-1 in the low word)
swap.w %d0; swap.w %d1; swap.w %d2; swap.w %d3
swap.w %d4; swap.w %d5; swap.w %d6; swap.w %d7
lea.l $L55(%pc),%a0
br.b L5
#
# Clipping is complete and the coordinate registers now hold y,,x.
# Park r.origin in %a1 and p in %d6, then derive the destination edge masks,
# the number of whole destination words between the edge words, and the
# byte strides of both bitmaps.
#
L55:
mov.l %d1,%a1
mov.l %d4,%d6
#
# So far
# %d7 == h-1,,w-1
# %d6 == p.y,,p.x
# %a6.w == fc
# %a5 == dm
# %a4 == sm
# %a1 == r.o.y,,r.o.x
#
# Compute masks, and width in words
#
mov.w %d6,%d0 # p.x /* left endpoint of dst */
mov.w %d7,%d1 # w-1
add.w %d6,%d1 # right endpoint
# mask1 = 0xffff >> (p.x & 15): bits of the first dst word at and right of p.x
mov.l &-1,%d3
mov.l &15,%d2
and.w %d0,%d2
lsr.w %d2,%d3 # mask1
# mask2 = ~(0xffff >> ((right & 15) + 1)): bits of the last dst word up to
# and including the right endpoint
mov.l &-1,%d5
mov.l &15,%d2
and.w %d1,%d2
add.w &1,%d2
lsr.w %d2,%d5
not.w %d5 # mask2
swap.w %d5
mov.w %d3,%d5 # mask2,,mask1
# word index of each endpoint; their difference minus 1 is the count of
# words strictly between the first and last dst words (-1 if they coincide)
asr.w &4,%d0
asr.w &4,%d1
sub.w %d0,%d1
sub.w &1,%d1 # inner-loop width in words
mov.l &0,%d4 # assume LtoR
# row strides: Bitmap width doubled to give bytes
mov.w width(%a5),%d3
add.w %d3,%d3
mov.w width(%a4),%d2
add.w %d2,%d2
#
# So far
# %d7 == h-1,,w-1 in bits
# %d6 == p.y,,p.x
# %d5 == mask2,,mask1
# %d4 == 0 (LtoR)
# %d3.w == dm width in bytes
# %d2.w == sm width in bytes
# %d1.w == inner-loop width in words
# %a6.w == fc
# %a5 == dm
# %a4 == sm
# %a1 == r.o.y,,r.o.x
#
# If necessary, compensate for overlap of source and destination
#
# Only done when sm and dm are the same Bitmap pointer.
#   - source above destination: start at the bottom row and negate both
#     strides so rows are processed bottom-to-top;
#   - wide case (w-1 >= 16) with source left of destination: start at the
#     right edge, set %d4 = -1 (RtoL) and exchange the two masks.
#
cmp.l %a4,%a5
bne.b L80 # overlap not possible
mov.l %d6,%d0 # p.y,,p.x
mov.w %a1,%d0 # p.y,,r.o.x
# low words are now equal, so the long compare orders the y values alone
cmp.l %a1,%d0 # r.o.y : p.y
bge.b L60 # if (r.o.y < p.y)
mov.l %d7,%d0 # h-1,,w-1
clr.w %d0 # h-1,,0
add.l %d0,%a1 # r.o.y += h-1;
add.l %d0,%d6 # p.y += h-1;
neg.w %d3 # wdst = -wdst;
neg.w %d2 # wsrc = -wsrc;
L60:
cmp.w %d7,&16
blt.b L70 # l<->r swap not needed for narrow
cmp.w %d6,%a1 # p.x : r.o.x
ble.b L70 # if (r.o.x < p.x)
# the add goes through %d0 so that only the low (x) word of %a1 changes
mov.l %a1,%d0
add.w %d7,%d0
mov.l %d0,%a1 # r.o.x += w-1;
add.w %d7,%d6 # p.x += w-1;
mov.l &-1,%d4 # RtoL
swap.w %d5 # masks in other order
L70:
L80:
#
# Locate actual starting points
#
# Calls addr twice: addr(sm, r.origin) for the source pointer (kept in %a2)
# and addr(dm, p) for the destination pointer (left in %a0).  The second
# call is a jmp whose "return address" was pushed beforehand: it is L82 for
# the narrow case (w-1 <= 15) or L85 for the wide case, so addr returns
# straight into the right code with dm and p (8 bytes) still on the stack.
# NOTE(review): addr is defined elsewhere; this code relies on it returning
# its result in %a0 and preserving the other registers in use -- confirm.
# Along the way %a6 becomes 4*fc, %d7.w becomes the inner word count, and
# %d6 becomes the rotate amount (bit offset of p.x minus bit offset of r.o.x).
#
mov.l %d6,%d0 # p.y,,p.x
swap.w %d0
mov.l %d0,-(%sp) # p
mov.l %a5,-(%sp) # dm
mov.l &15,%d0
lea.l $L82(%pc),%a0 # assume narrow
cmp.w %d7,%d0 # w-1 : 15
ble.b L81 # guessed correctly
lea.l $L85(%pc),%a0 # wide
L81:
mov.l %a0,-(%sp) # on return, go directly to wide/narrow code
add.w %a6,%a6; add.w %a6,%a6 # with 4*fc
mov.w %d1,%d7 # h-1 in bits,,inner width in words
and.l %d0,%d6 # 0,,bit offset of p.x
mov.l %a1,%d1 # r.o.y,,r.o.x
and.w %d1,%d0 # bit offset of r.o.x
sub.w %d0,%d6 # BO(p.x) - BO(r.o.x) /* amount of right rotation */
swap.w %d1 # r.o.x,,r.o.y
mov.l %d1,-(%sp) # r.o
mov.l %a4,-(%sp) # sm
lea.l addr,%a3
jsr (%a3)
mov.l %a0,%a2 # src = addr(sm,r.origin);
add.l &8,%sp
# tail call: addr's rts pops the L82/L85 address pushed at L81
jmp (%a3) # %a0 = addr(dm,p);
#
# Narrow case (w-1 <= 15): each row is handled with one 32-bit access.
# Build a single long mask in %d5, move h-1 into %d7.w, and dispatch through
# nrwtab on the function code and the rotate direction.
#
L82:
mov.l &0,%d4
mov.w %d5,%d4 # 0,,mask1
swap.w %d5 # mask1,,mask2 (proper long mask; maybe 16 bits too wide)
and.w %d5,%d4 # check for overlap of mask1 and mask2
beq.b L83 # no overlap ==> %d5 already correct
# both edges lie in the same 16-bit word: the mask is mask1&mask2 in the
# high word and nothing in the low word
mov.l %d4,%d5 # overlap ==> reduce %d5 by 16 bits
swap.w %d5 # and put it in the proper half
L83:
swap.w %d7 # ,,height-1
lea.l $nrwtab(%pc,%a6.w),%a6 # -> optab
tst.w %d6 # amount of right rotation
bge.b L84
# negative amount: use its magnitude with the rotate-left routine, whose
# table entry is the second word of the pair
neg.w %d6
add.l &2,%a6
L84:
# each table word is (routine - address of that word), so adding it to its
# own address yields the routine address
add.w (%a6),%a6
jmp (%a6)
#
# nrwtab: one pair of self-relative offsets per function code (4 bytes per
# code, indexed by 4*fc); first word = rotate-right loop, second = rotate-left.
#
nrwtab:
short opMnwr-nrwtab- 0, opMnwl-nrwtab- 2
short opSnwr-nrwtab- 4, opSnwl-nrwtab- 6
short opCnwr-nrwtab- 8, opCnwl-nrwtab-10
short opXnwr-nrwtab-12, opXnwl-nrwtab-14
#
# Narrow row loops.  Register use in all eight:
#   %a2 -> src long        %d2.w == src row stride (bytes, may be negative)
#   %a0 -> dst long        %d3.w == dst row stride (bytes, may be negative)
#   %d5 == 32-bit mask     %d6 == rotate count     %d7.w == h-1
# Each iteration fetches one source long, rotates it into alignment with the
# destination, combines it with the destination under the mask, and steps
# both pointers to the next row.  All exit through ret8.
# The "r" variants rotate right, the "l" variants rotate left.
#
# opM: replace the masked destination bits with the source bits,
#      dst = ((src ^ dst) & mask) ^ dst
opMnwr:
mov.l (%a2),%d0
mov.l (%a0),%d1
ror.l %d6,%d0
eor.l %d1,%d0
and.l %d5,%d0
eor.l %d1,%d0
mov.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opMnwr
br ret8
opMnwl:
mov.l (%a2),%d0
mov.l (%a0),%d1
rol.l %d6,%d0
eor.l %d1,%d0
and.l %d5,%d0
eor.l %d1,%d0
mov.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opMnwl
br ret8
# opS: dst |= (src & mask)
opSnwr:
mov.l (%a2),%d0
ror.l %d6,%d0
and.l %d5,%d0
or.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opSnwr
br ret8
opSnwl:
mov.l (%a2),%d0
rol.l %d6,%d0
and.l %d5,%d0
or.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opSnwl
br ret8
# opC: dst &= ~(src & mask)
opCnwr:
mov.l (%a2),%d0
ror.l %d6,%d0
and.l %d5,%d0
not.l %d0
and.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opCnwr
br ret8
opCnwl:
mov.l (%a2),%d0
rol.l %d6,%d0
and.l %d5,%d0
not.l %d0
and.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opCnwl
br ret8
# opX: dst ^= (src & mask)
opXnwr:
mov.l (%a2),%d0
ror.l %d6,%d0
and.l %d5,%d0
eor.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opXnwr
br ret8
opXnwl:
mov.l (%a2),%d0
rol.l %d6,%d0
and.l %d5,%d0
eor.l %d0,(%a0)
add.w %d2,%a2
add.w %d3,%a0
dbr %d7,opXnwl
br ret8
#
# Constants for the wide-case code generator.
#
# Opcode bases.  The generator uses "short MOVWI+05300" immediately followed
# by a one-word instruction: the pair assembles as a single mov.w immediate
# whose data is that instruction's opcode and whose destination is (%a5)+,
# i.e. it appends the instruction to the code being generated.  Likewise
# "short MOVWI+00000" / "short MOVLI+00000" load the following opcode(s)
# into %d0, and "short ADDWI+00r" adds the following opcode to %d<r>.
set DBR,0x51c8
# DBR+n is "dbr %d<n>,..."
set MOVLI,0x2000+074 # mov.l &...,
set MOVWI,0x3000+074 # mov.w &...,
set ADDWI,0x0640 # add.w &...,
# Flag bit numbers in %d6 (the high word; the low word is the rotate count).
# Bit 31 is used separately to mean right-to-left scanning.
set FDFRAG,16 # first destination is a fragment
set LDFRAG,17 # last destination is a fragment
set NSHF1,18
set FD2D,19 # first destination should store 2 words
set LD2D,20 # last destination should store 2 words
set FSTORE,21
# FSTORE is set when fc == 0 (plain store)
set DST1L,24 # dst inner count is 0
set SRC1L,25 # Nsrc is 2
# size in bytes of the stack buffer that receives the generated code
set gensiz,80
#
# widtab: 4 bytes per function code, indexed by 4*fc.
#   word 0: template for the instruction that applies %d0 to the destination
#   word 1: an instruction to emit just before it, or 0 for none
#           (only the "and" entry has one: not.w %d0)
#
widtab:
mov.w %d0,(%a0)+; short 0
or.w %d0,(%a0)+; short 0
and.w %d0,(%a0)+; not.w %d0
eor.w %d0,(%a0)+; short 0
#
# So far
# %d7 == h-1 (bits),,w (words)
# %d6 == 0,,rotate count
# %d5 == mask2,,mask1
# %d4 == -RtoL
# %d3.w == wdst (bytes)
# %d2.w == wsrc (bytes)
# %a6.w == 4*fc
# %a2 -> src
# %a0 -> dst
#
# Wide case (w-1 > 15): analyse the transfer and record the result as flag
# bits in the high word of %d6, then generate a row loop on the stack.
# This part: point %a6 at the widtab entry, record direction and fragment
# flags, pre-compensate the dst stride for the pointer movement caused by
# auto-increment/decrement, and handle the no-rotation (NOSHIFT) analysis.
#
L85:
lea.l $widtab(%pc,%a6.w),%a6
# bit 31 of %d6 <- scanning right to left
tst.w %d4; bpl.b L300; bset &31,%d6
L300:
mov.w %d7,%d0 # inner word count
bne.b L304; bset &DST1L,%d6
L304:
add.w &1,%d0 # Nsrc = 1+Ninner
mov.w %d0,%a1 # + ...
add.w &1,%d0 # Ndst = 1+Ninner+1
add.w %d0,%d0 # magnitude of dst addressing side effects
tst.l %d6; bpl.b L310
# right to left: the side effect is negative, and dst starts 2 bytes higher
# because predecrement addressing will be used
neg.w %d0; add.l &2,%a0 # RtoL
L310:
sub.w %d0,%d3 # compensate dst for autoincrement
mov.w %d5,%d4 # mask1
swap.w %d5 # mask2
# an edge mask other than 0xffff means that edge word is only partly written
cmp.w %d4,&-1; beq.b L320; bset &FDFRAG,%d6
L320:
# %d1.b is set to 0xff when the last word is full; seq leaves the condition
# codes alone, so the beq still tests the cmp
cmp.w %d5,&-1; seq.b %d1; beq.b L330; bset &LDFRAG,%d6
L330:
tst.w %d6; bne.b L360 # not NOSHIFT
#
# NOSHIFT: source and destination have the same bit alignment.
# Look up nstab with index  LDFRAG*4 + FDFRAG*2 + (inner count odd)
# while halving the inner count in %d7.  The table byte adjusts the halved
# count by +1 (0x80) or -1 (0x40) and toggles FD2D / NSHF1 in %d6.
#
add.w &1,%a1 # Nsrc = 1+Ninner+1
mov.l %d6,%d0; swap.w %d0; ext.w %d0 # 0,,flag bits
asr.w &1,%d7; roxl.w &1,%d0 # account for inner words odd
mov.b $nstab(%pc,%d0.w),%d0
bpl.b L340; add.w &1,%d7
L340:
# doubling the byte moves 0x40 into the sign bit for the next test and
# lines 0x04/0x02 up with the FD2D/NSHF1 bit positions
add.b %d0,%d0
bpl.b L350; sub.w &1,%d7
L350:
swap.w %d0; eor.l %d0,%d6 # the bits
btst &DST1L,%d6; bne.b L355
btst &FD2D,%d6; beq.b L410
L355:
# widen mask1 to a long: 0xffff stays all ones; otherwise the word mask
# goes to the high word and the low word becomes all ones
ext.l %d4; bmi.b L410; swap.w %d4; not.w %d4 # NOSHIFT mask1 .l
br.b L410 # NOSHIFT mask2 .l
nstab:
byte 0x82,0x80,0x04,0x80 # 0x80: +1 inner; 0x40: -1 inner
byte 0x02,0x00,0x44,0x00 # 0x04: FD2D; 0x02: NSHF1 no first word
#
# Shifted wide case: source and destination differ in bit alignment.
#   1. A full last word (%d1.b == 0xff from the earlier seq) is folded into
#      the inner loop.
#   2. The rotate count is normalised to -8..7 (negative == rotate left) and
#      the matching "ro[rl].l &n,%d0" opcode is built in %a3.
#   3. For each edge fragment, decide whether a single source word supplies
#      all of its bits (then the edge is done as a long, consuming one inner
#      word and setting FD2D / LD2D) or whether another source word must be
#      fetched (Nsrc in %a1 is incremented).
#
L360:
ext.w %d1; sub.w %d1,%d7 # extend inner loop
mov.l &0xf,%d0 # 0 1 7 8 9 e f
add.w &8,%d6 # 8 9 f 0 1 6 7
and.w %d0,%d6
sub.w &8,%d6 # 0 1 7 -8 -7 -2 -1 X=C= sign
mov.w %d6,%d1; bge.b L367 # X unchanged
neg.w %d1 # 8 7 2 1 X=C= 1
L367:
# %d1 = 2*|count| + (1 if rotating left), kept to 4 bits so a count of 8
# becomes 0; shifted left 8 it lines up with the count/direction field of a
# rotate opcode
roxl.w &1,%d1 # 0 2 e 11 f 5 3
and.w %d0,%d1 # 0 2 e 1 f 5 3
lsl.w &8,%d1 # magic position
# the next two lines assemble as one add.w immediate: the opcode of
# "ror.l &8,%d0" is added to %d1
short ADDWI+001
ror.l &8,%d0
mov.w %d1,%a3 # the rotate instruction
# rotate a long holding one word of ones to see which bit positions a single
# source word occupies after alignment
mov.l &0,%d1; not.w %d1 # 0,,-1
ror.l %d6,%d1 # where the bits are after a rotate
mov.w %d1,%d0; and.w %d4,%d0; beq.b L370 # 1 src word covers dst frag
not.w %d1; and.w %d4,%d1; beq.b L370
add.w &1,%a1; br.b L390 # fragment needs another src word
L370:
sub.w &1,%d7 # .l takes an inner word
bset &FD2D,%d6
# widen mask1 to a long (same scheme as in the NOSHIFT path)
ext.l %d4; bmi.b L390
swap.w %d4; not.w %d4 # mask1 .l
L390:
# same test for the last fragment, using the other half of the rotated ones
swap.w %d1
mov.w %d1,%d0; and.w %d5,%d0; beq.b L400 # 1 src word covers dst frag
not.w %d1; and.w %d5,%d1; beq.b L400
add.w &1,%a1; br.b L420 # fragment needs another src word
L400:
# dbr decrements %d7 and branches unless the result is -1, i.e. unless there
# was no inner word left to absorb
dbr %d7,L405 # .l takes an inner word
clr.w %d7; br.b L420 # nothing there to take
L405:
L410:
bset &LD2D,%d6
# widen mask2 to a long
ext.l %d5; bmi.b L420
swap.w %d5; not.w %d5 # mask2 .l
#
# Finish the analysis and start generating code.
#   - FSTORE is set when fc (re-read from the stack, 8 bytes deeper because
#     dm and p are still pushed) is zero.
#   - The src stride is compensated for auto-increment in the same way the
#     dst stride was.
#   - gensiz bytes are reserved on the stack; %a5 is the emit pointer.
#   - The strides are moved to the high words of %d2/%d3; the low words are
#     then used as opcode corrections (0 for .w, -0x1000 / 0100 for .l).
# If Nsrc > 2 the general compiler at L445 is used.  Otherwise the row needs
# at most one long fetch and the short sequence below is generated directly.
#
L420:
tst.w NREG*4-4+fc+8(%sp); bne.b L430; bset &FSTORE,%d6
L430:
mov.w %a1,%d0 # Nsrc
add.w %d0,%d0 # magnitude of src addressing side effects
tst.l %d6; bpl.b L431
neg.w %d0; add.l &2,%a2 # RtoL
L431:
sub.w %d0,%d2 # compensate src for autoincrement
lea.l -gensiz(%sp),%sp
mov.l %sp,%a5
swap.w %d3
swap.w %d2
cmp.w %a1,&2; bgt L445
#
# Short case.  First emit the long fetch (predecrement form when RtoL).
#
# the next two lines load the opcode of "mov.l (%a2)+,%d0" into %d0.w
short MOVWI+00000
mov.l (%a2)+,%d0
tst.l %d6; bpl.b L432; add.w &010,%d0 # RtoL
L432:
mov.w %d0,(%a5)+
# no register parity; select the .l forms of the fetch and operator opcodes
mov.l &0,%d1; mov.w &-0x1000,%d2; mov.w &0100,%d3
# %a1 -> inline parameter: the word at (%a1) is the "and" template and the
# code at 2(%a1) is where emission continues afterwards
lea.l $L438(%pc),%a1
mov.l &-1,%d0 # prepare bits to decide on "swap"
# note: a negative rotate count in %d6.w is replaced by its magnitude here;
# the direction is already encoded in the rotate opcode held in %a3
tst.w %d6; bpl.b L432d; neg.w %d6
lsl.l %d6,%d0; br.b L432e
L432d:
lsr.l %d6,%d0
L432e:
btst &DST1L,%d6; beq.b L434
#
# No inner words: the whole destination row is one long.  Make sure both
# masks are in long form, merge them, and emit (optional swap) + (optional
# rotate) + masked operator.
#
bset &FD2D,%d6; bne.b L432a
ext.l %d4; bmi.b L432a; swap.w %d4; not.w %d4 # mask1 .l
L432a:
bset &LD2D,%d6; bne.b L432b
ext.l %d5; bmi.b L432b; swap.w %d5; not.w %d5 # mask2 .l
L432b:
and.l %d5,%d4; mov.l %d4,%d5 # single .l does it all
add.l &1,%d4; beq L730 # all 32 bits
sub.l &1,%d4 # need an "and"
# if the shifted ones in %d0 do not cover the whole mask, the generated code
# must exchange the halves of the fetched long first
and.l %d5,%d0
cmp.l %d5,%d0
beq.b L432c
# emit "swap.w %d0"
short MOVWI+05300
swap.w %d0
L432c:
tst.w %d6; bne L690 # and a rotate
br.b L437 # NOSHIFT
#
# One or more inner words but Nsrc <= 2: fetch one long, rotate it, keep a
# copy in %d1, and use it for the first word (mask1) and then the last word
# (mask2).
#
L434:
mov.w %a3,(%a5)+ # the rotate instr
# emit "mov.l %d0,%d1"
short MOVWI+05300
mov.l %d0,%d1 # copy after rotate
and.l %d4,%d0
cmp.l %d4,%d0
# %d0.w = 1 if the compare was equal, else 0; added to the opcode of
# "swap.w %d0" this selects swap.w %d1 or swap.w %d0
seq.b %d0; neg.b %d0; ext.w %d0
short ADDWI+000
swap.w %d0
mov.w %d0,(%a5)+
lea.l $L436(%pc),%a1
br.b L437
L436:
# inline parameter for the first word; emission resumes on the next line
and.w %d4,%d0
# second word: operate on %d1 (parity 01001), word-sized opcodes
mov.w &01001,%d1; clr.w %d2; clr.w %d3
lea.l $L438(%pc),%a1
L437:
br L700
L438:
# inline parameter for the last word; then go close the row loop
and.w %d5,%d0
br L545
#
# General wide compiler.  Generates, into the stack buffer:
#     [first word]  [inner loop, unrolled by two, closed by dbr %d6]
#     [last word]   [re-init of inner count]  [stride adds]  dbr %d7 ; jmp (%a5)
# genwid emits the code for one destination word; the word that follows each
# bsr is its inline parameter: "and.w %d4,%d0" = first word,
# "and.w %d5,%d0" = last word, 0 = inner-loop word.
#
L445:
#
# During compilation
# %d7 == h-1,,w
# %d6 == flags,,rotate count
# %d5 == mask2
# %d4 == mask1
# %d3 == dst_dW,,bits for xxx.[wl]
# %d2 == src_dW,,bits for mov.[wl]
# %d1.w == parity
# %a6 -> optab
# %a5 -> next generated instruction
# %a4 -> top of inner loop
# %a3.w == rotate instruction
# %a2 -> src
# %a1 -> fragment "and" instruction
# %a0 -> dst
#
tst.w %d6; bne.b L480 # not NOSHIFT ==> always need first word
btst &NSHF1,%d6; bne.b L485 # interplay of NOSHIFT, odd, FDFRAG
L480:
mov.l &1,%d1
and.w %d7,%d1 # parity of inner word count
lsl.w &2,%d1 # even ==> frag in %d0, odd ==> frag in %d1
bsr genwid # generate for first word
and.w %d4,%d0
L485:
# With FSTORE the generated edge code uses %d6 as a temporary, so in that
# case the inner-count reload is emitted here, after the first word; in the
# other case it is emitted at L540, after the last word.
cmp.w %d7,&2; ble.b L490 # inner dbr always falls through
btst &FSTORE,%d6; beq.b L490 # no conflict "mov field" vs. %d6
short MOVWI+05300 # init inner count
mov.w %a4,%d6
L490:
mov.l %a5,%a4 # top of inner loop
# halve the inner count: the loop body handles two words per iteration.
# negative -> no inner section; carry clear -> even; carry set and zero ->
# exactly one word; otherwise odd and > 1, entered in the middle
asr.w &1,%d7 # check inner word count
blt.b L540 # single .l does it all
bcc.b L500 # even
beq.b L520 # 1
# emit a br.b whose displacement byte is patched at L510
short MOVWI+05300
br.b L500 # jump into middle of inner loop
add.l &1,%a4 # remember to fixup "br.b"
add.w &1,%d7 # middle entry ==> no dbr offset
L500:
# Z here comes from the asr (even path) or from the add.w just above
beq.b L530 # no inner words at all
mov.l &4,%d1 # use %d1 in
bsr.b genwid # even half of inner loop
short 0
L510:
# an odd %a4 marks a pending br.b fixup: store (%a5 - end of the br.b) in
# its displacement byte, which also steps %a4 to the real loop top
mov.w %a4,%d0; neg.w %d0
bclr &0,%d0; beq.b L520
add.w %a5,%d0; mov.b %d0,(%a4)+ # fixup "br.b" into middle
L520:
mov.l &0,%d1 # use %d0 in
bsr.b genwid # odd half of inner loop
short 0
sub.w &1,%d7 # offset for inner dbr loop
ble.b L530 # dbr always falls through
# emit "dbr %d6,<loop top>"; the displacement is measured from the address
# of the displacement word itself
mov.w &DBR+6,(%a5)+
sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
L530:
btst &LDFRAG,%d6; beq.b L540 # omit "and" for full last word
mov.l &4,%d1
bsr.b genwid
and.w %d5,%d0
L540:
tst.w %d7; ble.b L545 # no inner loop
btst &FSTORE,%d6; bne.b L545 # possible conflict "mov field" vs. %d6
short MOVWI+05300 # init inner count
mov.w %a4,%d6
L545:
#
# Close the row loop: step dst and src to the next row (omitted when the
# compensated stride is zero), then dbr %d7 back to the start of the buffer
# and "jmp (%a5)" to leave.  The strides are moved into %a1/%a3 here because
# that is where the generated add instructions expect them.
#
swap.w %d3; tst.w %d3; beq.b L546 # wdst is full width of bitmap
mov.w %d3,%a1 # dst_dW
short MOVWI+05300
add.w %a1,%a0
L546:
swap.w %d2; tst.w %d2; beq.b L547 # wsrc is full width of bitmap
mov.w %d2,%a3 # src_dW
short MOVWI+05300
add.w %a3,%a2
L547:
mov.w &DBR+7,(%a5)+
mov.l %sp,%a4 # top of outer loop
# if the buffer begins with a br.b (byte 0x60), aim the dbr at that
# branch's target instead
cmp.b (%a4),&0x60; bne.b L548 # not br.b
mov.b 1(%a4),%d0; ext.w %d0; lea.l 2(%a4,%d0.w),%a4 # collapse branches
L548:
sub.l %a5,%a4; mov.w %a4,(%a5)+ # dbr displacement
# final generated instruction: jmp (%a5), with %a5 pointing at retgen
short MOVWI+05300
jmp (%a5)
#
# Set up the execution-time registers and run the generated code.
#
mov.w %d7,%a4 # init inner count
mov.w %d7,%d6 # init inner count, 2nd case
swap.w %d7 # h-1
lea.l $retgen(%pc),%a5
jmp (%sp)
#
# genwid: emit the code for one destination word.
# In:   %d1   == 0 to work in %d0, 4 to work in %d1 (index into genget)
#       word after the bsr == inline parameter:
#             0               inner-loop word
#             and.w %d4,%d0   first word (bit 0 of the opcode clear)
#             and.w %d5,%d0   last word  (bit 0 of the opcode set)
#       %d6, %a3, %a5, %a6 as described under "During compilation"
# Out:  instructions appended at (%a5)+; control returns to the instruction
#       following the inline parameter (jmp 2(%a1)).
# Uses: %d0, %d1, the low words of %d2 and %d3, %a1.
#
genwid:
mov.l (%sp)+,%a1 # -> inline parameter
# %d0 = opcode pair "swap.w %dN; mov.w (%a2)+,%dN" for the chosen register
mov.l $genget(%pc,%d1.w),%d0
# parity 01001 carries the register number 1 in both the bits 11-9 and the
# bits 2-0 positions of an opcode, so adding parts of it redirects a %d0
# template to %d1
tst.w %d1; beq.b L550; mov.w &01001,%d1; swap.w %d1 # parity bits
L550:
clr.w %d2; clr.w %d3 # .[wl] bits default to .w
# RtoL: turn the fetch's (%a2)+ into -(%a2)
tst.l %d6; bpl.b L560; add.w &010,%d0 # RtoL
L560:
tst.w %d6; bne.b L569 # not NOSHIFT
#
# NOSHIFT: no swap/rotate is needed, only the fetch (low word of %d0).
#
bclr &9,%d0 # NOSHIFT always %d0
mov.w (%a1),%d1; bne.b L564 # not inner loop
btst &FSTORE,%d6; beq.b L562 # not "mov"
# inner word of a plain store: copy the source addressing mode into the
# destination mode field and widen to .l, giving a memory-to-memory
# mov.l (%a2)+,(%a0)+  (or the predecrement form)
mov.l &070,%d1; and.w %d0,%d1
lsl.w &3,%d1; or.w %d1,%d0 # copy RtoL mode
add.w &-0x1000,%d0 # .w ==> .l
mov.w %d0,(%a5)+
L561:
jmp 2(%a1)
#
# genget: fetch templates, 4 bytes each, selected by %d1 (0 or 4).
# These instructions are data and are never executed in place.
#
genget:
swap.w %d0; mov.w (%a2)+,%d0
swap.w %d1; mov.w (%a2)+,%d1
L562:
# long-sized fetch and operator
mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l
add.w %d2,%d0
L563:
mov.l &0,%d1 # NOSHIFT always %d0
br L698 # assemble the fetch, then do the op
L564:
# edge word: carry <- bit 0 of the inline parameter (set for the last word)
lsr.w &1,%d1; bcs.b L562 # NOSHIFT always LD2D
btst &FD2D,%d6; bne.b L562
br.b L563 # alas, .w
#
# genwid, shifted case: emit the fetch and the register shuffling that feeds
# the rotate.  On entry %d0 = swap,,fetch opcode pair, the high word of %d1 =
# parity, and (%a1) = inline parameter.  Every path ends at L690 with the
# parity in %d1.w.
#
L569:
mov.w (%a1),%d1; beq.b L630 # inner loop
L570:
lsr.w &1,%d1; bcs.b L580 # last word
#
# First word: a long fetch, then "mov.l %dA,%dB" and possibly "swap.w %dN".
#
add.w &-0x1000,%d0 # force fetch .l
mov.w %d0,(%a5)+ # the fetch .l
# the next three lines load the opcode pair "mov.l %d0,%d1; swap.w %d0"
# into %d0 as one long immediate
short MOVLI+00000
mov.l %d0,%d1
swap.w %d0
clr.w %d1; eor.l %d1,%d0 # parity for mov.l %d[01],%d[10]
tst.l %d1; sne.b %d1; sub.b %d1,%d0 # parity for swap.w %d[01]
# stored but %a5 not advanced yet: how much of this pair is kept, and in
# which order, is decided by the bit tests below
mov.l %d0,(%a5) # ran out of registers
# 0x4c80ec is a packed decision table.  It is rotated according to the
# direction, rotate sense and FD2D, after which three sign-bit tests choose
# whether a swap is needed, whether it names the other register, and
# whether the pair order is exchanged.
# NOTE(review): the individual table bits have not been decoded here.
mov.l &0x4c80ec,%d0 # microcoded bits
tst.l %d6; bpl.b L572; ror.l &1,%d0 # RtoL
L572:
tst.w %d6; bpl.b L574; ror.l &2,%d0 # rol
L574:
btst &FD2D,%d6; beq.b L576; ror.l &4,%d0 # first op .l
mov.w &-0x1000,%d2; mov.w &0100,%d3 # .w +=> .l corrections
L576:
ror.l &1,%d0; bpl.b L578 # "swap" not needed
add.l &2,%a5
ror.l &8,%d0; bpl.b L577 # existing "swap" parity OK
eor.w &1,(%a5)
L577:
ror.l &8,%d0; bpl.b L578 # existing order OK
sub.l &2,%a5
mov.l (%a5),%d0; swap.w %d0; mov.l %d0,(%a5)
add.l &2,%a5
L578:
add.l &2,%a5
swap.w %d1 # junk,,parity
br.b L690
L580:
# last word: use long-sized opcodes when it stores two words
btst &LD2D,%d6; beq.b L630 # operator .w
mov.w &-0x1000,%d2 # mov.w +=> mov.l
mov.w &0100,%d3 # xxx.w +=> xxx.l
L630:
#
# Inner or last word: emit the swap/fetch pair, ordered according to
# whether the rotate sense (sign of %d6.b) matches the scan direction
# (bit 31 of %d6).
#
tst.l %d6; smi.b %d1
eor.b %d6,%d1; bpl.b L650 # rotation in same direction as scan
swap.w %d0 # interchange "swap" and "mov"
L650:
mov.l %d0,(%a5)+
swap.w %d1 # junk,,parity
mov.w (%a1),%d0; lsr.w &1,%d0; bcs.b L660 # last word
# inner word: also emit "mov.l %d0,%d1" (or "mov.l %d1,%d0" when parity is
# set; the eor.w below is executed now, on the opcode just loaded)
short MOVWI+000
mov.l %d0,%d1
eor.w %d1,%d0
mov.w %d0,(%a5)+
br.b L690
L660:
# last word, left-to-right, stored as a long: adjust the swaps just emitted
tst.l %d6; bmi.b L690 # RtoL
btst &LD2D,%d6; beq.b L690 # not .l
tst.w %d6; bpl.b L670 # ror
sub.l &2,%a5; br.b L690 # no "swap"
L670:
mov.w -4(%a5),(%a5)+ # extra "swap"
#
# genwid tail: emit the rotate, the optional mask, and the operator.
#   L690  rotate instruction from %a3, redirected to %d1 when parity is set
#   L698  emit the opcode in %d0.w as is (NOSHIFT enters here with the fetch)
#   L700  decide from the inline parameter whether a mask is needed
#   L730  optional pre-instruction from widtab word 1, then the operator
#   L770  common tail: add the widtab template, apply the RtoL correction,
#         emit, and return past the inline parameter
# %d2.w / %d3.w hold the .w -> .l opcode corrections for mov and for the
# other operators; %d1.w holds the register parity.
#
L690:
mov.w %a3,%d0
eor.b %d1,%d0
L698:
mov.w %d0,(%a5)+ # the rotate instruction
L700:
mov.w (%a1),%d0; beq.b L730 # inner loop
btst &0,%d0; bne.b L705 # last word
btst &FDFRAG,%d6; beq.b L730 # no "and"
L705:
# size correction, plus a net +01000 when parity is set (destination %d1)
add.w %d3,%d0; add.w %d1,%d0; sub.b %d1,%d0 # and.[wl] %d[45],%d[01]
btst &FSTORE,%d6; beq.b L720
# "mov" partial word
# generated sequence: fetch dst into %d6; eor %d6,%dN; and mask,%dN;
# eor %d6,%dN; then store %dN -- leaving unmasked dst bits unchanged
swap.w %d0 # save the "and"
short MOVWI+00000 # ,%d0
mov.w (%a0),%d6
add.w %d2,%d0 # mov.[wl]
tst.l %d6; bpl.b L710; add.w &020,%d0 # RtoL; "(%a0)" ==> "-(%a0)"
L710:
mov.w %d0,(%a5)+ # instr to fetch memory part of word
short MOVWI+00000 # ,%d0
eor.w %d6,%d0
add.w %d3,%d0; add.b %d1,%d0 # eor.[wl] %d6,%d[01]
# emit eor, and, then eor again
swap.w %d0; mov.l %d0,(%a5)+; swap.w %d0; mov.w %d0,(%a5)+
mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
# the fetch above already moved %a0 when RtoL, so the store changes from
# (%a0)+ to plain (%a0)
mov.l &-0100,%d1 # RtoL correction, if necessary
br.b L770
L720:
mov.w %d0,(%a5)+ # "and" for non-mov operators
L730:
mov.w 2(%a6),%d0; beq.b L740 # not F_CLR
add.w %d3,%d0; add.b %d1,%d0 # not.[wl] %d[01]
mov.w %d0,(%a5)+
L740:
btst &FSTORE,%d6; beq.b L790 # non-"mov"
mov.w %d2,%d0; add.b %d1,%d0 # mov.[wl] %d[01],
mov.l &0100,%d1 # RtoL correction, if necessary
L770:
add.w (%a6),%d0
tst.l %d6; bpl.b L780
add.w %d1,%d0 # RtoL correction
L780:
mov.w %d0,(%a5)+
jmp 2(%a1)
L790:
# or/and/eor to memory: the data register number sits in bits 11-9, taken
# from the parity word with its low byte cleared
mov.w %d1,%d0; clr.b %d0; add.w %d3,%d0 # xxx.[wl] %d[01]
mov.l &010,%d1 # RtoL correction, if necessary
br.b L770
#
# During execution
# %d[01] == rotator
# %d2 [reserved for texture bits]
# %d3 [reserved for texture index]
# %d4 == mask1
# %d5 == mask2
# %d6.w == inner count
# %d7.w == outer count
# %a0 -> dst
# %a1 == dst_dW
# %a2 -> src
# %a3 == src_dW
# %a4.w == inner count init
# %a5 -> retgen
# %a6 [reserved for -> texture]
#