The CONFIGURE_DEPENDS option was added in CMake 3.12. It makes the build system re-run CMake automatically whenever the result of the glob changes, which solves the major issue with globbing source files. It may have a performance impact, but that should be negligible compared to the time spent building.
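For example, a minimal sketch (the variable and target names here are placeholders):

file(GLOB_RECURSE MTL_SOURCES CONFIGURE_DEPENDS "src/*.cpp" "src/*.s")
add_executable(my_game ${MTL_SOURCES})

With CONFIGURE_DEPENDS, adding or removing a file that matches the glob triggers a re-configure on the next build instead of silently building with a stale file list.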
218 lines
4.9 KiB
ArmAsm
// Load as ARM instructions into the GBA IWRAM section.
// IWRAM sits on a 32-bit bus instead of a 16-bit one, so it can
// fetch ARM instructions faster.
.section .iwram, "ax", %progbits
.arm
.align 2

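// Hypothetical C-side prototypes for the entry points below, assuming the
// usual AAPCS mapping of r0/r1/r2 to the first three arguments (these
// declarations are illustrative and not part of the original source):
//   void mtl_memcpy32 (void *dst, const void *src, unsigned num_byte);
//   void mtl_hybridcpy(void *dst, const void *src, unsigned num_byte);
//   void mtl_hybridmove(void *dst, const void *src, unsigned num_byte);
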
// Copies num_byte bytes from src to dst.
// dst and src !! MUST BE WORD ALIGNED !!
// however, num_byte does not need to be aligned
// r0, r1: dst, src
// r2: num_byte
.global mtl_memcpy32
.type mtl_memcpy32 STT_FUNC
mtl_memcpy32:
    // r12 = num_residual_byte (bytes left over after whole 32-byte chunks)
    and r12, r2, #31
    // r2 = num_chunk
    lsrs r2, r2, #5
    // Skip chunk copy if there are no chunks
    beq .Lword_process

.Lchunk_process:
    // Preserve local variables of the calling function
    push {r4-r10}
.Lchunk_copy:
    // Load an 8-word chunk from src into registers, incrementing src after
    ldmia r1!, {r3-r10}
    // Store the 8-word chunk from the registers into dst, incrementing dst after
    stmia r0!, {r3-r10}
    // Copy again if more than zero chunks are left
    subs r2, r2, #1
    bhi .Lchunk_copy
    // Restore local variables of the calling function
    pop {r4-r10}

.Lword_process:
    // r2 = num_word (whole words left over after the chunks)
    lsrs r2, r12, #2
    // Skip word copy if there are no words
    beq .Lbyte_process
.Lword_copy:
    // Load a word from src into a register, incrementing src after
    ldr r3, [r1], #4
    // Store the word from the register into dst, incrementing dst after
    str r3, [r0], #4
    // Copy again if more than zero words are left
    subs r2, r2, #1
    bhi .Lword_copy

.Lbyte_process:
    // r12 = number of trailing bytes (num_byte % 4)
    and r12, r12, #3
.Lbyte_copy:
    // Decrement the byte count ahead of time; by checking the carry bit
    // during the load/store/branch we can avoid an unnecessary branch.
    // Remember: carry = !borrow. Carry is set when the subtraction does
    // not underflow, i.e. the byte count was > 0.
    subs r12, r12, #1
    // Load a byte from src into a register, incrementing src after
    ldrcsb r3, [r1], #1
    // Store the byte from the register into dst, incrementing dst after
    strcsb r3, [r0], #1
    // Copy again if more bytes are left (carry set and result non-zero)
    bhi .Lbyte_copy
    // Return
    bx lr

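// Copies num_byte bytes from src to dst one halfword at a time.
// dst and src must be halfword aligned; num_byte may be odd
// (any trailing byte is copied separately).
// r0, r1: dst, src
// r2: num_byte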
.global mtl_dumbcpy16
.type mtl_dumbcpy16 STT_FUNC
mtl_dumbcpy16:
    // r12 = has residual byte
    and r12, r2, #1
    // r2 = num_hword
    lsr r2, r2, #1
.Lhword_copy:
    // Copy half words
    subs r2, r2, #1
    ldrcsh r3, [r1], #2
    strcsh r3, [r0], #2
    bhi .Lhword_copy
    // Copy residual byte if needed
    cmp r12, #0
    ldrneb r3, [r1]
    strneb r3, [r0]
    bx lr

// Performs a generic byte-by-byte memcpy.
// Still faster than std::memcpy due to being put in IWRAM.
// r0, r1: dst, src
// r2: num_bytes
.global mtl_dumbcpy
.type mtl_dumbcpy STT_FUNC
mtl_dumbcpy:
    subs r2, r2, #1
    ldrcsb r3, [r1], #1
    strcsb r3, [r0], #1
    bhi mtl_dumbcpy
    bx lr

// Calls mtl_memcpy32 if src and dst are word aligned,
// otherwise calls mtl_dumbcpy16 or mtl_dumbcpy.
.global mtl_hybridcpy
.type mtl_hybridcpy STT_FUNC
mtl_hybridcpy:
    // Combine the low bits of dst and src to test their common alignment
    orr r3, r0, r1
    // Byte copy if either pointer is only byte aligned
    ands r12, r3, #1
    bne mtl_dumbcpy
    // Halfword copy if only halfword aligned
    ands r12, r3, #2
    bne mtl_dumbcpy16
    // Otherwise both are word aligned, copy by word
    b mtl_memcpy32

// Copies num_byte bytes from src to dst in reverse, starting from the
// last byte. This makes the copy safe when dst and src overlap and the
// destination is after the source.
// dst and src !! MUST BE WORD ALIGNED !!
// r0, r1: dst, src
// r2: num_byte
.global mtl_rmemcpy32
.type mtl_rmemcpy32 STT_FUNC
mtl_rmemcpy32:
    // Move to the last byte of src and dst
    add r0, r2
    add r1, r2
    sub r0, #1
    sub r1, #1
    // r12 = num residual bytes (num_byte % 4)
    and r12, r2, #3
.Lrbyte_copy:
    subs r12, #1
    ldrcsb r3, [r1], #-1
    strcsb r3, [r0], #-1
    bhi .Lrbyte_copy
    // r12 = num residual words (whole words not belonging to a 32-byte chunk)
    lsr r12, r2, #2
    and r12, #7
    // Move to the beginning of the current word
    sub r0, #3
    sub r1, #3
.Lrword_copy:
    subs r12, #1
    ldrcs r3, [r1], #-4
    strcs r3, [r0], #-4
    bhi .Lrword_copy
    // r2 = num chunks
    lsr r2, #5
    // The pointers now address the last word of the current chunk,
    // which is exactly the base address ldmda/stmda expect.
    // Preserve local variables
    push {r4-r10}
.Lrchunk_copy:
    subs r2, #1
    ldmcsda r1!, {r3-r10}
    stmcsda r0!, {r3-r10}
    bhi .Lrchunk_copy
    // Restore local variables
    pop {r4-r10}
    bx lr

// Copies num_byte bytes a halfword at a time, in reverse.
// dst and src must be halfword aligned.
.global mtl_rdumbcpy16
.type mtl_rdumbcpy16 STT_FUNC
mtl_rdumbcpy16:
    // Move to the last byte of src and dst
    add r0, r2
    add r1, r2
    sub r0, #1
    sub r1, #1
    // r12 = has residual byte (also sets the flags for the copy below)
    ands r12, r2, #1
    // Copy the residual byte if there is one
    ldrneb r3, [r1], #-1
    strneb r3, [r0], #-1
    // Move to the start of the last whole halfword
    sub r0, #1
    sub r1, #1
    // r2 = num of half words
    lsrs r2, #1
.Lrhword_copy:
    subs r2, #1
    ldrcsh r3, [r1], #-2
    strcsh r3, [r0], #-2
    bhi .Lrhword_copy
    bx lr

// Performs a generic byte-by-byte memcpy in reverse.
// This allows a safe copy when the dst and src overlap,
// and the destination is after the source.
.global mtl_rdumbcpy
.type mtl_rdumbcpy STT_FUNC
mtl_rdumbcpy:
    add r0, r2
    add r1, r2
    sub r0, #1
    sub r1, #1
.Lcpy:
    subs r2, #1
    ldrcsb r3, [r1], #-1
    strcsb r3, [r0], #-1
    bhi .Lcpy
    bx lr

// Performs a reverse copy, choosing a function depending on the alignment.
.global mtl_rhybridcpy
.type mtl_rhybridcpy STT_FUNC
mtl_rhybridcpy:
    // Combine the low bits of dst and src to test their common alignment
    orr r3, r0, r1
    // Dumb copy if only byte aligned (the AND result is discarded,
    // only the flags matter)
    ands r12, r3, #1
    bne mtl_rdumbcpy
    // Dumb copy by halfword if only halfword aligned
    ands r12, r3, #2
    bne mtl_rdumbcpy16
    // Otherwise it's safe to copy by word
    b mtl_rmemcpy32

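// Copies num_byte bytes from src to dst, handling overlapping regions
// like memmove: copies in reverse when dst is above src, forwards when
// dst is below src, and does nothing when they are equal.
// r0, r1: dst, src
// r2: num_byte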
.global mtl_hybridmove
.type mtl_hybridmove STT_FUNC
mtl_hybridmove:
    cmp r0, r1
    // dst is above src: copy in reverse so overlapping bytes are not clobbered
    bhi mtl_rhybridcpy
    // dst is below src: a forward copy is safe
    blo mtl_hybridcpy
    // src and dst are the same, no need to copy
    bx lr
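
// Example call sequence (illustrative only; the buffer labels and size
// below are hypothetical and not defined in this file):
//     ldr r0, =dest_buffer    // dst
//     ldr r1, =src_buffer     // src
//     mov r2, #240            // num_byte
//     bl  mtl_hybridmove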