// Load as ARM instructions into the GBA IWRAM section.
// IWRAM sits on a 32-bit bus (instead of the 16-bit bus used by
// ROM/EWRAM), so it is able to fetch ARM instructions faster.
    .section .iwram, "ax", %progbits
    .arm
    .align 2

// void mtl_memcpy32(void *dst, const void *src, u32 num_byte)
// Copies num_byte bytes from src to dst.
// dst and src !! MUST BE WORD ALIGNED !!
// however, num_byte does not need to be aligned.
// In:       r0 = dst, r1 = src, r2 = num_byte
// Clobbers: r2, r3, r12 (r4-r10 are saved around the chunk loop)
    .global mtl_memcpy32
    .type mtl_memcpy32 STT_FUNC
mtl_memcpy32:
    // r12 = bytes left over after whole 32-byte chunks (0-31)
    and     r12, r2, #31
    // r2 = number of 32-byte (8-word) chunks
    lsrs    r2, r2, #5
    // Skip the chunk loop entirely when there are no chunks
    beq     .Lword_process

.Lchunk_process:
    // r4-r10 are callee-saved; preserve them for the caller
    push    {r4-r10}
.Lchunk_copy:
    // Load an 8-word chunk from src, post-incrementing src
    ldmia   r1!, {r3-r10}
    // Store the 8-word chunk to dst, post-incrementing dst
    stmia   r0!, {r3-r10}
    // Loop while chunks remain (result still > 0)
    subs    r2, r2, #1
    bhi     .Lchunk_copy
    // Restore the caller's registers
    pop     {r4-r10}

.Lword_process:
    // r2 = number of whole words in the residue (0-7)
    lsrs    r2, r12, #2
    // Skip the word loop when there are no whole words
    beq     .Lbyte_process
.Lword_copy:
    // Copy one word, post-incrementing src/dst
    ldr     r3, [r1], #4
    str     r3, [r0], #4
    subs    r2, r2, #1
    bhi     .Lword_copy

.Lbyte_process:
    // r12 = number of trailing bytes (0-3)
    and     r12, r12, #3
.Lbyte_copy:
    // Decrement the count ahead of time; the carry flag then gates
    // the load/store, avoiding an extra branch.
    // !REMEMBER, carry = !borrow: C is set when the subtraction did
    // not underflow, i.e. the count was > 0.
    subs    r12, r12, #1
    ldrcsb  r3, [r1], #1
    strcsb  r3, [r0], #1
    // Loop while bytes remain (C set and result non-zero)
    bhi     .Lbyte_copy
    bx      lr

// void mtl_dumbcpy16(void *dst, const void *src, u32 num_byte)
// Copies num_byte bytes a halfword at a time.
// dst and src must be halfword aligned; num_byte may be odd.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_dumbcpy16
    .type mtl_dumbcpy16 STT_FUNC
mtl_dumbcpy16:
    // r12 = 1 if there is a trailing odd byte
    and     r12, r2, #1
    // r2 = number of halfwords
    lsr     r2, r2, #1
.Lhword_copy:
    // Carry-gated halfword copy (same pattern as mtl_memcpy32)
    subs    r2, r2, #1
    ldrcsh  r3, [r1], #2
    strcsh  r3, [r0], #2
    bhi     .Lhword_copy
    // Copy the trailing byte if the length was odd
    cmp     r12, #0
    ldrneb  r3, [r1]
    strneb  r3, [r0]
    bx      lr

// void mtl_dumbcpy(void *dst, const void *src, u32 num_bytes)
// Performs a generic byte-by-byte memcpy with no alignment needs.
// Still faster than std::memcpy due to being put in IWRAM.
// In: r0 = dst, r1 = src, r2 = num_bytes
    .global mtl_dumbcpy
    .type mtl_dumbcpy STT_FUNC
mtl_dumbcpy:
    subs    r2, r2, #1
    ldrcsb  r3, [r1], #1
    strcsb  r3, [r0], #1
    bhi     mtl_dumbcpy
    bx      lr

// void mtl_hybridcpy(void *dst, const void *src, u32 num_byte)
// Calls mtl_memcpy32 if src and dst are word aligned,
// mtl_dumbcpy16 if both are halfword aligned,
// otherwise calls mtl_dumbcpy.
    .global mtl_hybridcpy
    .type mtl_hybridcpy STT_FUNC
mtl_hybridcpy:
    // r3 = dst | src: a low bit is clear only if clear in both
    orr     r3, r0, r1
    // Byte copy if either pointer is odd
    ands    r12, r3, #1
    bne     mtl_dumbcpy
    // Halfword copy if only halfword aligned
    ands    r12, r3, #2
    bne     mtl_dumbcpy16
    // Both word aligned
    b       mtl_memcpy32

// void mtl_rmemcpy32(void *dst, const void *src, u32 num_byte)
// Reverse (high-to-low) counterpart of mtl_memcpy32: copies the
// trailing bytes, then trailing words, then 32-byte chunks, so a
// copy with overlapping regions and dst > src is safe.
// dst and src !! MUST BE WORD ALIGNED !!; num_byte is arbitrary.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_rmemcpy32
    .type mtl_rmemcpy32 STT_FUNC
mtl_rmemcpy32:
    // Point r0/r1 at the last byte of dst/src
    add     r0, r2
    add     r1, r2
    sub     r0, #1
    sub     r1, #1
    // r12 = number of trailing bytes (0-3)
    // BUGFIX: was "and r12, r2, #2", which dropped bit 0 and lost
    // any odd trailing byte (e.g. num_byte = 1 copied nothing).
    and     r12, r2, #3
.Lrbyte_copy:
    // Carry-gated reverse byte copy
    subs    r12, #1
    ldrcsb  r3, [r1], #-1
    strcsb  r3, [r0], #-1
    bhi     .Lrbyte_copy
    // r12 = number of whole words outside full chunks (0-7)
    // BUGFIX: was masked with #3; a chunk is 8 words, so up to 7
    // residual words must be handled here (num_byte = 28 previously
    // copied only 12 of them).
    lsr     r12, r2, #2
    and     r12, #7
    // Step from the last byte of the current word down to its base
    sub     r0, #3
    sub     r1, #3
.Lrword_copy:
    subs    r12, #1
    ldrcs   r3, [r1], #-4
    strcs   r3, [r0], #-4
    bhi     .Lrword_copy
    // r2 = number of 32-byte chunks
    lsr     r2, #5
    // r0/r1 now address the last word of the last chunk, which is
    // exactly the base LDMDA/STMDA expect: they transfer
    // [Rn-28 .. Rn] and write back Rn -= 32.
    // BUGFIX: removed a spurious "sub r0/r1, #28" here, which made
    // every chunk transfer hit 28 bytes below the intended chunk.
    // Preserve local variables of the calling function
    push    {r4-r10}
.Lrchunk_copy:
    subs    r2, #1
    ldmcsda r1!, {r3-r10}
    stmcsda r0!, {r3-r10}
    bhi     .Lrchunk_copy
    // Restore local variables of the calling function
    pop     {r4-r10}
    bx      lr

// Performs a copy a halfword at a time, in reverse.
// void mtl_rdumbcpy16(void *dst, const void *src, u32 num_byte)
// Copies num_byte bytes a halfword at a time, in reverse
// (high-to-low), so overlapping regions with dst > src are safe.
// dst and src must be halfword aligned; num_byte may be odd.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_rdumbcpy16
    .type mtl_rdumbcpy16 STT_FUNC
mtl_rdumbcpy16:
    // Point r0/r1 one past the end of dst/src
    add     r0, r2
    add     r1, r2
    // r12 = 1 if the length is odd.
    // BUGFIX: the original used a non-flag-setting "and", so the
    // ldrneb/strneb below tested stale flags from the caller (always
    // NE when entered via bne from mtl_rhybridcpy) and copied a
    // bogus "residual" byte for even lengths.
    ands    r12, r2, #1
    // Copy the trailing odd byte first (highest address), using
    // pre-indexed writeback so r0/r1 land on the halfword-aligned
    // remainder.
    // BUGFIX: the original left r0/r1 on an odd offset before the
    // halfword loop, making every ldrh/strh below unaligned
    // (UNPREDICTABLE on ARM7TDMI).
    ldrneb  r3, [r1, #-1]!
    strneb  r3, [r0, #-1]!
    // r2 = number of halfwords
    lsrs    r2, #1
.Lrhword_copy:
    // Carry-gated reverse halfword copy; pre-indexed writeback
    // steps down one halfword per iteration.
    subs    r2, #1
    ldrcsh  r3, [r1, #-2]!
    strcsh  r3, [r0, #-2]!
    bhi     .Lrhword_copy
    bx      lr

// void mtl_rdumbcpy(void *dst, const void *src, u32 num_byte)
// Performs a generic byte-by-byte memcpy in reverse.
// This allows a safe copy when the dst and src overlap,
// and the destination is after the source.
// In: r0 = dst, r1 = src, r2 = num_byte
    .global mtl_rdumbcpy
    .type mtl_rdumbcpy STT_FUNC
mtl_rdumbcpy:
    // Point r0/r1 at the last byte of dst/src
    add     r0, r2
    add     r1, r2
    sub     r0, #1
    sub     r1, #1
.Lcpy:
    // Carry-gated copy, counting down (see mtl_memcpy32)
    subs    r2, #1
    ldrcsb  r3, [r1], #-1
    strcsb  r3, [r0], #-1
    bhi     .Lcpy
    bx      lr

// void mtl_rhybridcpy(void *dst, const void *src, u32 num_byte)
// Performs a reverse copy, choosing a function depending on the
// common alignment of dst and src.
    .global mtl_rhybridcpy
    .type mtl_rhybridcpy STT_FUNC
mtl_rhybridcpy:
    orr     r3, r0, r1
    // Dumb copy if only byte aligned, discard result
    ands    r12, r3, #1
    bne     mtl_rdumbcpy
    // Dumb copy by halfword if only halfword aligned
    ands    r12, r3, #2
    bne     mtl_rdumbcpy16
    // Otherwise it's safe to copy by word
    b       mtl_rmemcpy32

// void mtl_hybridmove(void *dst, const void *src, u32 num_byte)
// memmove-style dispatch: copies backwards when dst > src so that
// overlapping regions are handled correctly.
    .global mtl_hybridmove
    .type mtl_hybridmove STT_FUNC
mtl_hybridmove:
    cmp     r0, r1
    bhi     mtl_rhybridcpy
    blo     mtl_hybridcpy
    // src and dst are the same, no need to copy
    bx      lr