Fix inaccurate implementation of udiv100000 and udiv1000000000

Both assembly macros failed when given large numbers ending in 9. For example, udiv100000 of 3999999999 produced 40000 instead of 39999. Similarly, udiv1000000000 of 3999999999 produced 4 instead of 3. Neither of the previous implementations satisfied the constraints of the Granlund-Montgomery integer division algorithm. This commit replaces these macros with the correct implementation generated by clang for a constant integer division. I do not understand how this implementation works. All other macros do satisfy the constraints of the Granlund-Montgomery algorithm.
2024-04-01 19:45:51 -06:00 · 2024-04-01 19:45:51 -06:00 · d1155befdb
commit d1155befdb
parent 1b48b5ec80
1 changed files with 20 additions and 5 deletions
--- a/include/mtl/armv4t/asm/math.s
+++ b/include/mtl/armv4t/asm/math.s
@ -7,7 +7,8 @@
 * 0xCCCCCCCD >> 35 approximately equals 0.1
 * Performs a 64 bit multiply of 0xCCCCCCCD and rx, shifts the high 32 bits
 * by 3, and discards the low bits. This results in a division by 10 that
- * works for all unsigned values of rx
+ * works for all unsigned values of rx. This satisfies the constraints of
+ * the Granlund-Montgomery integer division algorithm.
 */
 .macro udiv10 rd, rx, rt
 	ldr	\rt, =0xCCCCCCCD
@ -33,10 +34,20 @@
 	lsrs	\rd, $13
 .endm

+/*
+ * When using the Granlund-Montgomery integer division algorithm, the magic
+ * number produced does not fit inside the int32 range. GM produces:
+ *   m = 0x14F8B588F
+ *   k = 17
+ *
+ * This division uses the output produced by clang for a division by 100000:
+ * since 100000 = 2^5 * 3125, it computes (rx >> 5) * ceil(2^39 / 3125) >> 39.
+ */
 .macro udiv100000 rd, rx, rt
-	ldr	\rt, =0x29f17
+	lsr	\rx, $5
+	ldr	\rt, =0xA7C5AC5
 	umull	\rt, \rd, \rx, \rt
-	lsrs	\rd, $2
+	lsrs	\rd, $7
 .endm

 .macro udiv1000000 rd, rx, rt
@ -57,10 +68,14 @@
 	lsrs	\rd, $25
 .endm

+/*
+ * Same situation as udiv100000: (rx >> 9) * ceil(2^39 / 1953125) >> 39
+ */
 .macro udiv1000000000 rd, rx, rt
-	ldr	\rt, =0x44b82fa1
+	lsr	\rx, $9
+	ldr	\rt, =0x44B83
 	umull	\rt, \rd, \rx, \rt
-	lsrs	\rd, $28
+	lsrs	\rd, $7
 .endm

 /*