From 956168658521a5c9dfd42b16c8d2d9899c6b6390 Mon Sep 17 00:00:00 2001
From: Madeline Busig <madeline.busig@outlook.com>
Date: Sun, 28 Jul 2024 19:24:47 -0600
Subject: [PATCH] Optimize and expand fixed point number implementation

Before this commit, fixed point multiplication was implemented using an
assembly routine in a separate translation unit. This commit implements
this routine directly using inline assembly. By doing so, these
operations can be inlined when called from ARM code. Fixed point
division is implemented as well, along with various documentation and
style improvements.
---
 include/mtl/fixed.hpp | 162 ++++++++++++++++++++++++++++++++++++------
 src/armv4t/fixed.s    |  11 ---
 2 files changed, 140 insertions(+), 33 deletions(-)
 delete mode 100644 src/armv4t/fixed.s
diff --git a/include/mtl/fixed.hpp b/include/mtl/fixed.hpp
index 92efa59..f4b708d 100644
--- a/include/mtl/fixed.hpp
+++ b/include/mtl/fixed.hpp
@@ -1,13 +1,9 @@
 #pragma once
 
 #include <cstdint>
+#include <type_traits>
 
-/**
- * \brief Fixed point multiply assembly implementation
- *
- * DO NOT USE DIRECTLY! Use fixed::operator* instead
- */
-extern "C" int32_t mtl_fixed_mul(int32_t x, int32_t y);
+#include "mtl/target.hpp"
 
 namespace mtl {
 /**
@@ -20,63 +16,185 @@ namespace mtl {
  *
  * Has a maximum error of +/- 1/128 (~0.0078), integers are always
  * exactly.
+ *
+ * \par ARM
+ *
+ * All member functions are compiled in ARM mode because some operators (notably
+ * multiplication and division) use ARM-only instructions. For optimal performance,
+ * fixed point numbers should be used in ARM-mode code to enable inlining. To ensure
+ * inlining is enabled, enclose the include directive in `TARGET_ARM_MODE` and
+ * `TARGET_END_MODE` from `<mtl/target.hpp>`. This is necessary because inline assembly
+ * is used and GCC can't tell that ARM-only instructions are used, so it tries
+ * to inline in Thumb mode too. If these directives are not used, some operations
+ * will not be inlined even in ARM mode (e.g. multiplication and division).
  */
 class fixed {
+private:
 	int32_t x;
 
 	/**
 	 * \brief Raw constructor
 	 *
 	 * Creates a new fixed point number with the raw data of x.
+	 *
+	 * \note
+	 *
+	 * DO NOT USE DIRECTLY. Use `from_raw` instead.
+	 *
+	 * \note
+	 *
 	 * DO NOT use to set the fixed number to an integer value, use
 	 * the public constructor instead.
 	 */
-	constexpr fixed(int32_t _x, bool) : x(_x) {}
+	ARM_MODE constexpr fixed(int32_t _x, bool) : x(_x) {} // the unnamed bool only selects this overload; its value is never read
 
 public:
-	constexpr fixed() : x(0) {}
+	ARM_MODE constexpr fixed() : x(0) {} // default construction yields the raw value 0
 	/**
-	 * \brief 32-bit integer constructor
+	 * \brief Integer constructor
 	 *
 	 * Creates a new fixed point number with the value of the integer.
 	 * Must be within the range represented by fixed point numbers, see
 	 * the class description for more detail.
 	 */
-	constexpr fixed(int32_t _i) : x(_i * 64) {}
+	template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true> // participates only for integral T
+	ARM_MODE constexpr fixed(T _i) : x(_i * 64) {} // raw value is the integer scaled by 64
 	/**
 	 * \brief Floating point constructor
 	 *
 	 * Creates a new fixed point number with the closest number to
 	 * the floating point number. Must be within the range represented by
 	 * fixed point numbers, see the class description for more detail.
+	 *
+	 * Must be implemented as a template with enable_if, otherwise passing
+	 * an int (not int32_t) is ambiguous between the promotion to int32_t and
+	 * float.
 	 */
-	constexpr fixed(float _f)
+	template <typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true> // participates only for floating point T
+	ARM_MODE constexpr fixed(T _f) // NOTE(review): +0.5 then truncation rounds to nearest only for non-negative input (e.g. -1.3 gives raw -82, not -83) - confirm intended
 		// 0.5 offset accounts for truncating to integer, round instead
 		: x((_f * 64) + 0.5f) {}
-	
+
+	/**
+	 * \brief Construct from a raw value
+	 *
+	 * Returns the fixed point number whose underlying storage is exactly x.
+	 *
+	 * \note
+	 *
+	 * Prefer the converting constructors unless the raw value is really needed.
+	 */
+	ARM_MODE static constexpr fixed from_raw(int32_t x) {
+		return fixed{x, true};
+	}
+
+	/**
+	 * \brief Raw value accessor
+	 *
+	 * Returns the underlying 32-bit storage, i.e. the represented number
+	 * multiplied by 64.
+	 */
+	ARM_MODE constexpr int32_t raw() const {
+		return this->x;
+	}
+
 	/**
 	 * \brief Fixed point addition
 	 *
 	 * Addition with fixed point numbers is the same as with a 32-bit
 	 * integer, so should be extremely quick.
 	 */
-	fixed operator+(fixed rhs) const {
-		return fixed(x + rhs.x, true);
+	ARM_MODE constexpr fixed operator+(fixed rhs) const {
+		return from_raw(x + rhs.x); // both raw values share the same scale, so a plain integer add suffices
 	}
 
 	/**
 	 * \brief Fixed point multiplication
 	 *
 	 * Uses an assembly implementation to multiply the two numbers.
-	 * Not as quick as an integer multiplication. Use sparringly.
-	 *
-	 * Tested on the MGBA Gameboy Advance emulator, takes around 70
-	 * cycles when the assembly routine is placed in IWRAM.
-	 * The Gameboy Advance uses an armv7tdmi, and IWRAM is the fastest
-	 * available RAM.
 	 */
-	fixed operator*(fixed rhs) const {
-		return fixed(mtl_fixed_mul(x, rhs.x), true);
+#ifdef __ARM_32BIT_STATE        // Safe to inline in ARM mode, but not in Thumb mode
+	ALWAYS_INLINE           // because ARM-mode instructions are used. GCC isn't smart
+#else                           // enough to figure it out on its own
+	NOINLINE
+#endif
+	ARM_MODE fixed operator*(fixed rhs) const {
+		int32_t lo, hi; // low/high words of the 64-bit product; register choice is left to the compiler
+		asm(
+				"smull	%[lo], %[hi], %[a], %[b];"      // hi:lo = x * rhs.x
+				"lsr	%[lo], %[lo], #6;"              // discard the 6 surplus fractional bits
+				"orr	%[lo], %[lo], %[hi], lsl #26;"  // refill the top 6 bits from the low bits of hi
+				// Early-clobber outputs: before ARMv6, smull needs RdLo, RdHi and Rm to be distinct.
+				: [lo] "=&r" (lo),
+				  [hi] "=&r" (hi)
+				: [a] "r" (x),
+				  [b] "r" (rhs.x)
+		   );
+		return from_raw(lo);
+	}
+
+	/**
+	 * \brief Fixed point division
+	 *
+	 * Uses the GBA BIOS `Div` syscall. Fastest (a single syscall) for
+	 * numerators in [-0x7FFFF, 0x7FFFF]; larger numerators need two.
+	 *
+	 * Division by zero saturates: the result is the raw value 0x7FFFFFFF
+	 * for a non-negative numerator and 0x80000000 for a negative one, as
+	 * fixed point numbers cannot represent infinity. The sign therefore
+	 * follows the numerator only, whatever side zero was approached from.
+	 */
+#ifdef __ARM_32BIT_STATE        // Safe to inline in ARM mode, but not in Thumb mode
+	ALWAYS_INLINE           // because ARM-mode instructions are used. GCC isn't smart
+#else                           // enough to figure it out on its own
+	NOINLINE
+#endif
+	ARM_MODE fixed operator/(fixed rhs) const {
+		int32_t raw_result;
+		asm(
+				// Two strategies, both built on the BIOS Div syscall (r0 / r1 -> quotient in r0,
+				// remainder in r1; r3 is clobbered as well):
+				//  - fast: shift the numerator left by 6 first so the quotient keeps its
+				//    fractional bits; only valid when that shift cannot overflow.
+				//  - compat: divide unshifted to get the integral quotient, then divide the
+				//    remainder a second time to recover the fractional bits.
+				// Since |remainder| < |denominator|, the remainder can be shifted left by 6
+				// exactly whenever the denominator itself passes the overflow check. Only
+				// otherwise is the denominator shifted right instead (it is then never zero).
+				"movs	r1, %[d];"                      // Load denominator, Z is set when it is zero
+				"beq	2f;"
+				"mov	r0, %[n];"                      // Load numerator
+				"eor	r2, r0, r0, asr #31;"           // r2 = n for n >= 0, ~n for n < 0
+				"tst	r2, #0x7e000000;"               // Bits 25-30 set: n << 6 would overflow
+				"bne	1f;"
+				"lsl	r0, #6;"                        // Fast method: shift first to avoid truncation
+				"swi	#0x60000;"                      // GBA Div syscall
+				"mov	%[res], r0;"
+				"b	3f;"
+				"1:"	// compat_div                   // Compatible method
+				"swi	#0x60000;"                      // Integral quotient, remainder left in r1
+				"lsl	r2, r0, #6;"                    // Quotient scaled back to fixed point
+				"mov	r0, r1;"                        // Remainder becomes the next numerator
+				"eor	r3, %[d], %[d], asr #31;"       // Same overflow check, now on the denominator
+				"tst	r3, #0x7e000000;"
+				"moveq	r0, r0, lsl #6;"                // Small denominator: exact (rem << 6) / d
+				"moveq	r1, %[d];"
+				"movne	r1, %[d], asr #6;"              // Large denominator: rem / (d >> 6), signed shift
+				"swi	#0x60000;"
+				"add	%[res], r2, r0;"                // Integral part + fractional part
+				"b	3f;"
+				"2:"	// zero_div
+				"teq	%[n], %[d];"                    // d is 0 here, so N reflects the numerator's sign
+				"movmi	%[res], #0x80000000;"
+				"movpl	%[res], #0x7FFFFFFF;"
+				"3:"
+				: [res] "=r" (raw_result)
+				: [n] "r" (x),
+				  [d] "r" (rhs.x)
+				:  "r0", "r1", "r2", "r3", "cc"         // cc: movs/tst/teq change the condition flags
+		   );
+
+		return from_raw(raw_result);
+	}
 };
 
diff --git a/src/armv4t/fixed.s b/src/armv4t/fixed.s
deleted file mode 100644
index 88c2682..0000000
--- a/src/armv4t/fixed.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.section .iwram, "ax", %progbits
-.arm
-.align 2
-
-.global mtl_fixed_mul
-.type mtl_fixed_mul STT_FUNC
-mtl_fixed_mul:
-smull	r2, r3, r0, r1
-lsr	r0, r2, #6
-orr	r0, r3, lsl #26
-bx	lr