// mtl/src/memcpy.s — hand-written ARM copy/move routines for the GBA.

// Load as ARM instructions into GBA IWRAM section.
// IWRAM section has a 32bit buffer instead of 16bit, so it is
// able to load arm instructions faster.
.section .iwram, "ax", %progbits
.arm
.align 2
// Copies num_byte bytes from src to dst.
// dst and src !! MUST BE WORD ALIGNED !!
// however, num_byte does not need to be aligned
// r0, r1: dst, src
// r2: num_byte
.global mtl_memcpy32
.type mtl_memcpy32 STT_FUNC
mtl_memcpy32:
// Split the length: r12 keeps the 0-31 tail bytes, r2 becomes the
// number of 32-byte chunks.
and r12, r2, #31
movs r2, r2, lsr #5
// No chunks at all? Go straight to the word-sized tail.
beq .Lmcpy32_words
// r4-r10 belong to the caller; save them around the block copies.
push {r4-r10}
.Lmcpy32_chunk:
// Stream 32 bytes per round through r3-r10; both pointers
// auto-increment past the chunk.
ldmia r1!, {r3-r10}
stmia r0!, {r3-r10}
subs r2, r2, #1
bhi .Lmcpy32_chunk
pop {r4-r10}
.Lmcpy32_words:
// r2 = whole words inside the 0-31 byte tail.
movs r2, r12, lsr #2
beq .Lmcpy32_bytes
.Lmcpy32_word:
ldr r3, [r1], #4
str r3, [r0], #4
subs r2, r2, #1
bhi .Lmcpy32_word
.Lmcpy32_bytes:
// r12 = the final 0-3 stray bytes.
and r12, r12, #3
.Lmcpy32_byte:
// Count down first: carry (= !borrow) stays set while a byte was
// still pending, so one subs predicates both the copy and the branch.
subs r12, r12, #1
ldrcsb r3, [r1], #1
strcsb r3, [r0], #1
bhi .Lmcpy32_byte
bx lr
// Copies num_byte bytes a halfword at a time.
// dst and src !! MUST BE HALFWORD ALIGNED !! (ldrh/strh require it)
// num_byte may be odd; the trailing byte is copied separately.
// r0, r1: dst, src
// r2: num_byte
.global mtl_dumbcpy16
.type mtl_dumbcpy16 STT_FUNC
mtl_dumbcpy16:
// r12 = 1 if num_byte is odd (one residual byte), else 0
and r12, r2, #1
// r2 = num_hword; no flags needed here, the loop's subs sets them
lsr r2, r2, #1
.Lhword_copy:
// Decrement first: carry (= !borrow) is set while a halfword was
// still pending, so the load/store pair is predicated on it
subs r2, r2, #1
ldrcsh r3, [r1], #2
strcsh r3, [r0], #2
bhi .Lhword_copy
// Copy residual byte if needed (r0/r1 now point just past the halfwords)
cmp r12, #0
ldrneb r3, [r1]
strneb r3, [r0]
bx lr
// Byte-by-byte copy that works for any alignment.
// Running from IWRAM still beats the ROM-resident std::memcpy.
// r0, r1: dst, src
// r2: num_bytes
.global mtl_dumbcpy
.type mtl_dumbcpy STT_FUNC
mtl_dumbcpy:
.Ldumb_byte:
// Count down up front; carry (= !borrow) survives the predicated
// load/store, so a single subs drives both the copy and the loop.
subs r2, r2, #1
ldrcsb r3, [r1], #1
strcsb r3, [r0], #1
bhi .Ldumb_byte
bx lr
// Dispatches to the fastest forward copy the alignment allows:
// mtl_memcpy32 when src and dst are both word aligned,
// mtl_dumbcpy16 when both are halfword aligned,
// mtl_dumbcpy otherwise.
// r0, r1: dst, src
// r2: num_byte
.global mtl_hybridcpy
.type mtl_hybridcpy STT_FUNC
mtl_hybridcpy:
// Combined misalignment bits of both pointers
orr r3, r0, r1
// tst is the flag-only AND; the old `ands r12, ...` burned a
// register just to throw the result away
tst r3, #1
bne mtl_dumbcpy
tst r3, #2
bne mtl_dumbcpy16
b mtl_memcpy32
// Copies num_byte bytes from src to dst, walking backwards from the
// end of the buffers, so overlapping ranges with dst above src are safe.
// dst and src !! MUST BE WORD ALIGNED !!
// however, num_byte does not need to be aligned
// r0, r1: dst, src
// r2: num_byte
.global mtl_rmemcpy32
.type mtl_rmemcpy32 STT_FUNC
mtl_rmemcpy32:
// Move to last byte of src and dst
add r0, r2
add r1, r2
sub r0, #1
sub r1, #1
// r12 = num residual bytes (0-3)
// BUGFIX: mask was #2, which dropped bit 0 and miscounted the tail
// (odd lengths lost a byte and left the word loop misaligned)
and r12, r2, #3
.Lrbyte_copy:
// Carry (= !borrow) from the subs predicates the copy and the branch
subs r12, #1
ldrcsb r3, [r1], #-1
strcsb r3, [r0], #-1
bhi .Lrbyte_copy
// r12 = num residual words outside the 32-byte chunks (0-7)
// BUGFIX: mask was #3, which lost bit 4 of the count
// (e.g. num_byte = 16 copied nothing at all)
lsr r12, r2, #2
and r12, #7
// Move from the last byte of the current word to its start
sub r0, #3
sub r1, #3
.Lrword_copy:
subs r12, #1
ldrcs r3, [r1], #-4
strcs r3, [r0], #-4
bhi .Lrword_copy
// r2 = num chunks
movs r2, r2, lsr #5
// Nothing left: return without touching the callee-saved registers
bxeq lr
// Preserve local variables of calling function
push {r4-r10}
// After the word loop r0/r1 already sit on the highest word of the
// next chunk, which is exactly the base LDMDA wants (it accesses
// base-28 .. base, then writes back base-32).
// BUGFIX: the old `sub #28` here shifted every chunk access 28 bytes
// too low.
.Lrchunk_copy:
subs r2, #1
ldmcsda r1!, {r3-r10}
stmcsda r0!, {r3-r10}
bhi .Lrchunk_copy
// Restore local variables of calling function
pop {r4-r10}
bx lr
// Performs a copy a halfword at a time, in reverse.
// dst and src !! MUST BE HALFWORD ALIGNED !! (ldrh/strh require it)
// Safe for overlapping buffers when dst is above src.
// r0, r1: dst, src
// r2: num_byte
.global mtl_rdumbcpy16
.type mtl_rdumbcpy16 STT_FUNC
mtl_rdumbcpy16:
// Move to last byte of src and dst
add r0, r2
add r1, r2
sub r0, #1
sub r1, #1
// r12 = has residual byte
// BUGFIX: was `and` (no S), which leaves the flags untouched, so the
// ldrneb/strneb below tested whatever flags the caller left behind
ands r12, r2, #1
// Copy residual byte if there is one
ldrneb r3, [r1], #-1
strneb r3, [r0], #-1
// BUGFIX: step back from the last byte of the trailing halfword to
// its start; without this every ldrh/strh hit an odd address
sub r0, #1
sub r1, #1
// r2 = num of half words
lsrs r2, #1
.Lrhword_copy:
subs r2, #1
ldrcsh r3, [r1], #-2
strcsh r3, [r0], #-2
bhi .Lrhword_copy
bx lr
// Generic byte-by-byte copy that runs from the last byte down to the
// first, so overlapping buffers with dst above src copy safely.
// r0, r1: dst, src
// r2: num_bytes
.global mtl_rdumbcpy
.type mtl_rdumbcpy STT_FUNC
mtl_rdumbcpy:
// Park both cursors on the final byte (base + count - 1)
add r1, r1, r2
add r0, r0, r2
sub r1, r1, #1
sub r0, r0, #1
.Lrdumb_byte:
// Pre-decrement the count; carry (= !borrow) means a byte was still
// pending, so the conditional load/store and the branch share one subs
subs r2, r2, #1
ldrcsb r3, [r1], #-1
strcsb r3, [r0], #-1
bhi .Lrdumb_byte
bx lr
// Reverse-copy dispatcher: picks the widest reverse copy the combined
// alignment of dst and src allows.
// r0, r1: dst, src
// r2: num_byte
.global mtl_rhybridcpy
.type mtl_rhybridcpy STT_FUNC
mtl_rhybridcpy:
// Combined misalignment bits of both pointers
orr r3, r0, r1
// tst is the flag-only AND; the old `ands r12, ...` burned a
// register just to throw the result away
tst r3, #1
bne mtl_rdumbcpy
tst r3, #2
bne mtl_rdumbcpy16
// Otherwise it's safe to copy by word
b mtl_rmemcpy32
// memmove-style dispatcher: picks the copy direction so that
// overlapping ranges stay safe.
// r0, r1: dst, src
// r2: num_byte
.global mtl_hybridmove
.type mtl_hybridmove STT_FUNC
mtl_hybridmove:
cmp r0, r1
// dst below src: a forward copy is safe; dst above src: go backwards
blo mtl_hybridcpy
bhi mtl_rhybridcpy
// dst == src: nothing to do
bx lr