// Load as ARM instructions into the GBA IWRAM section.
// IWRAM sits on a 32-bit bus (instead of the 16-bit bus used by
// ROM/EWRAM), so it is able to fetch ARM instructions faster.
    .section .iwram, "ax", %progbits
    .arm
    .align 2

// void mtl_memcpy32(void *dst, const void *src, u32 num_byte)
// Copies num_byte bytes from src to dst.
// dst and src !! MUST BE WORD ALIGNED !!
// however, num_byte does not need to be aligned.
// In:       r0 = dst, r1 = src, r2 = num_byte
// Clobbers: r2, r3, r12 (r4-r10 are saved around the chunk loop)
    .global mtl_memcpy32
    .type mtl_memcpy32 STT_FUNC
mtl_memcpy32:
    // r12 = bytes left over after whole 32-byte chunks (0-31)
    and     r12, r2, #31
    // r2 = number of 32-byte (8-word) chunks
    lsrs    r2, r2, #5
    // Skip the chunk loop entirely when there are no chunks
    beq     .Lword_process

.Lchunk_process:
    // r4-r10 are callee-saved; preserve them for the caller
    push    {r4-r10}
.Lchunk_copy:
    // Load an 8-word chunk from src, post-incrementing src
    ldmia   r1!, {r3-r10}
    // Store the 8-word chunk to dst, post-incrementing dst
    stmia   r0!, {r3-r10}
    // Loop while chunks remain (result still > 0)
    subs    r2, r2, #1
    bhi     .Lchunk_copy
    // Restore the caller's registers
    pop     {r4-r10}

.Lword_process:
    // r2 = number of whole words in the residue (0-7)
    lsrs    r2, r12, #2
    // Skip the word loop when there are no whole words
    beq     .Lbyte_process
.Lword_copy:
    // Copy one word, post-incrementing src/dst
    ldr     r3, [r1], #4
    str     r3, [r0], #4
    subs    r2, r2, #1
    bhi     .Lword_copy

.Lbyte_process:
    // r12 = number of trailing bytes (0-3)
    and     r12, r12, #3
.Lbyte_copy:
    // Decrement the count ahead of time; the carry flag then gates
    // the load/store, avoiding an extra branch.
    // !REMEMBER, carry = !borrow: C is set when the subtraction did
    // not underflow, i.e. the count was > 0.
    subs    r12, r12, #1
    ldrcsb  r3, [r1], #1
    strcsb  r3, [r0], #1
    // Loop while bytes remain (C set and result non-zero)
    bhi     .Lbyte_copy
    bx      lr

// void mtl_dumbcpy16(void *dst, const void *src, u32 num_byte)
// Copies num_byte bytes a halfword at a time.
// dst and src must be halfword aligned; num_byte may be odd.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_dumbcpy16
    .type mtl_dumbcpy16 STT_FUNC
mtl_dumbcpy16:
    // r12 = 1 if there is a trailing odd byte
    and     r12, r2, #1
    // r2 = number of halfwords
    lsr     r2, r2, #1
.Lhword_copy:
    // Carry-gated halfword copy (same pattern as mtl_memcpy32)
    subs    r2, r2, #1
    ldrcsh  r3, [r1], #2
    strcsh  r3, [r0], #2
    bhi     .Lhword_copy
    // Copy the trailing byte if the length was odd
    cmp     r12, #0
    ldrneb  r3, [r1]
    strneb  r3, [r0]
    bx      lr

// void mtl_dumbcpy(void *dst, const void *src, u32 num_bytes)
// Performs a generic byte-by-byte memcpy with no alignment needs.
// Still faster than std::memcpy due to being put in IWRAM.
// In: r0 = dst, r1 = src, r2 = num_bytes
    .global mtl_dumbcpy
    .type mtl_dumbcpy STT_FUNC
mtl_dumbcpy:
    subs    r2, r2, #1
    ldrcsb  r3, [r1], #1
    strcsb  r3, [r0], #1
    bhi     mtl_dumbcpy
    bx      lr

// void mtl_hybridcpy(void *dst, const void *src, u32 num_byte)
// Calls mtl_memcpy32 if src and dst are word aligned,
// mtl_dumbcpy16 if both are halfword aligned,
// otherwise calls mtl_dumbcpy.
    .global mtl_hybridcpy
    .type mtl_hybridcpy STT_FUNC
mtl_hybridcpy:
    // r3 = dst | src: a low bit is clear only if clear in both
    orr     r3, r0, r1
    // Byte copy if either pointer is odd
    ands    r12, r3, #1
    bne     mtl_dumbcpy
    // Halfword copy if only halfword aligned
    ands    r12, r3, #2
    bne     mtl_dumbcpy16
    // Both word aligned
    b       mtl_memcpy32

// void mtl_rmemcpy32(void *dst, const void *src, u32 num_byte)
// Reverse (high-to-low) counterpart of mtl_memcpy32: copies the
// trailing bytes, then trailing words, then 32-byte chunks, so a
// copy with overlapping regions and dst > src is safe.
// dst and src !! MUST BE WORD ALIGNED !!; num_byte is arbitrary.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_rmemcpy32
    .type mtl_rmemcpy32 STT_FUNC
mtl_rmemcpy32:
    // Point r0/r1 at the last byte of dst/src
    add     r0, r2
    add     r1, r2
    sub     r0, #1
    sub     r1, #1
    // r12 = number of trailing bytes (0-3)
    // BUGFIX: was "and r12, r2, #2", which dropped bit 0 and lost
    // any odd trailing byte (e.g. num_byte = 1 copied nothing).
    and     r12, r2, #3
.Lrbyte_copy:
    // Carry-gated reverse byte copy
    subs    r12, #1
    ldrcsb  r3, [r1], #-1
    strcsb  r3, [r0], #-1
    bhi     .Lrbyte_copy
    // r12 = number of whole words outside full chunks (0-7)
    // BUGFIX: was masked with #3; a chunk is 8 words, so up to 7
    // residual words must be handled here (num_byte = 28 previously
    // copied only 12 of them).
    lsr     r12, r2, #2
    and     r12, #7
    // Step from the last byte of the current word down to its base
    sub     r0, #3
    sub     r1, #3
.Lrword_copy:
    subs    r12, #1
    ldrcs   r3, [r1], #-4
    strcs   r3, [r0], #-4
    bhi     .Lrword_copy
    // r2 = number of 32-byte chunks
    lsr     r2, #5
    // r0/r1 now address the last word of the last chunk, which is
    // exactly the base LDMDA/STMDA expect: they transfer
    // [Rn-28 .. Rn] and write back Rn -= 32.
    // BUGFIX: removed a spurious "sub r0/r1, #28" here, which made
    // every chunk transfer hit 28 bytes below the intended chunk.
    // Preserve local variables of the calling function
    push    {r4-r10}
.Lrchunk_copy:
    subs    r2, #1
    ldmcsda r1!, {r3-r10}
    stmcsda r0!, {r3-r10}
    bhi     .Lrchunk_copy
    // Restore local variables of the calling function
    pop     {r4-r10}
    bx      lr

// Performs a copy a halfword at a time, in reverse.
// void mtl_rdumbcpy16(void *dst, const void *src, u32 num_byte)
// Copies num_byte bytes a halfword at a time, in reverse
// (high-to-low), so overlapping regions with dst > src are safe.
// dst and src must be halfword aligned; num_byte may be odd.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_rdumbcpy16
    .type mtl_rdumbcpy16 STT_FUNC
mtl_rdumbcpy16:
    // Point r0/r1 one past the end of dst/src
    add     r0, r2
    add     r1, r2
    // r12 = 1 if the length is odd.
    // BUGFIX: the original used a non-flag-setting "and", so the
    // ldrneb/strneb below tested stale flags from the caller (always
    // NE when entered via bne from mtl_rhybridcpy) and copied a
    // bogus "residual" byte for even lengths.
    ands    r12, r2, #1
    // Copy the trailing odd byte first (highest address), using
    // pre-indexed writeback so r0/r1 land on the halfword-aligned
    // remainder.
    // BUGFIX: the original left r0/r1 on an odd offset before the
    // halfword loop, making every ldrh/strh below unaligned
    // (UNPREDICTABLE on ARM7TDMI).
    ldrneb  r3, [r1, #-1]!
    strneb  r3, [r0, #-1]!
    // r2 = number of halfwords
    lsrs    r2, #1
.Lrhword_copy:
    // Carry-gated reverse halfword copy; pre-indexed writeback
    // steps down one halfword per iteration.
    subs    r2, #1
    ldrcsh  r3, [r1, #-2]!
    strcsh  r3, [r0, #-2]!
    bhi     .Lrhword_copy
    bx      lr

// void mtl_rdumbcpy(void *dst, const void *src, u32 num_byte)
// Performs a generic byte-by-byte memcpy in reverse.
// This allows a safe copy when the dst and src overlap,
// and the destination is after the source.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_rdumbcpy
    .type mtl_rdumbcpy STT_FUNC
mtl_rdumbcpy:
    // Point r0/r1 at the last byte of dst/src
    add     r0, r2
    add     r1, r2
    sub     r0, #1
    sub     r1, #1
.Lcpy:
    // Carry-gated copy, counting down (see mtl_memcpy32)
    subs    r2, #1
    ldrcsb  r3, [r1], #-1
    strcsb  r3, [r0], #-1
    bhi     .Lcpy
    bx      lr

// void mtl_rhybridcpy(void *dst, const void *src, u32 num_byte)
// Performs a reverse copy, choosing a function depending on the
// common alignment of dst and src.
    .global mtl_rhybridcpy
    .type mtl_rhybridcpy STT_FUNC
mtl_rhybridcpy:
    orr     r3, r0, r1
    // Dumb copy if only byte aligned, discard result
    ands    r12, r3, #1
    bne     mtl_rdumbcpy
    // Dumb copy by halfword if only halfword aligned
    ands    r12, r3, #2
    bne     mtl_rdumbcpy16
    // Otherwise it's safe to copy by word
    b       mtl_rmemcpy32

// void mtl_hybridmove(void *dst, const void *src, u32 num_byte)
// memmove-style dispatch: copies backwards when dst > src so that
// overlapping regions are handled correctly.
    .global mtl_hybridmove
    .type mtl_hybridmove STT_FUNC
mtl_hybridmove:
    cmp     r0, r1
    bhi     mtl_rhybridcpy
    blo     mtl_hybridcpy
    // src and dst are the same, no need to copy
    bx      lr