Remove conditional fixed point number inlining

Conditional inlining caused ODR violations. Fixed point numbers should now only be used in ARM-mode; attempting to use them in Thumb-mode will cause a compilation failure. This commit also moves operator/ into IWRAM on the GBA.
2024-07-30 11:45:08 -06:00 · 2024-07-30 11:45:08 -06:00 · 2181557d9d
commit 2181557d9d
parent 62da9d03c1
2 changed files with 93 additions and 74 deletions
--- a/include/mtl/fixed.hpp
+++ b/include/mtl/fixed.hpp
@ -5,6 +5,8 @@

 #include "mtl/target.hpp"

+TARGET_ARM_MODE
+
 namespace mtl {
 /**
 * \brief 32-bit Fixed point number
@ -19,14 +21,16 @@ namespace mtl {
 *
 * \par ARM
 *
- * All member functions are compiled in ARM mode because some operators (notably
- * multiplication and division) use ARM-only instructions. For optimal performance,
- * fixed point numbers should be used in ARM-mode code to enable inlining. To ensure
- * inlining is enabled, enclose the include directive in `TARGET_ARM_MODE` and
- * `TARGET_END_MODE` from `<mtl/target.hpp>`. This is necessary because inline assembly
- * is used and GCC can't tell that ARM-only instructions are used, so it tries
- * to inline in Thumb mode too. If these directives are not used, some operations
- * will not be inlined even in arm mode (ex. multiplication and division).
+ * All functions are compiled in ARM mode because some operators (notably
+ * multiplication and division) use ARM-only instructions. For compatibility
+ * and optimal performance, fixed point numbers should only be used in ARM-mode
+ * code. If `operator*` is used in Thumb code, compilation will fail.
+ * This happens because GCC attempts to inline the function even though it
+ * cannot be inlined in Thumb-mode. Conditional inlining using TARGET_*_MODE
+ * is not used because it is fragile: for example, when this header is included
+ * by `<vec4.hpp>` and also by `foo.cpp`, `vec4` would attempt to include the
+ * inlined version but `foo` would not, causing an ODR violation. All other
+ * operations are usable from Thumb-mode, with a significant performance penalty.
 */
 class fixed {
 private:
@ -46,10 +50,10 @@ private:
 	 * DO NOT use to set the fixed number to an integer value, use
 	 * the public constructor instead.
 	 */
-	ARM_MODE constexpr fixed(int32_t _x, bool) : x(_x) {}
+	constexpr fixed(int32_t _x, bool) : x(_x) {}

 public:
-	ARM_MODE constexpr fixed() : x(0) {}
+	constexpr fixed() : x(0) {}
 	/**
 	 * \brief Integer constructor
 	 *
@ -58,7 +62,7 @@ public:
 	 * the class description for more detail.
 	 */
 	template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
-	ARM_MODE constexpr fixed(T _i) : x(_i * 64) {}
+	constexpr fixed(T _i) : x(_i * 64) {}
 	/**
 	 * \brief Floating point constructor
 	 *
@ -71,7 +75,7 @@ public:
 	 * float.
 	 */
 	template <typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
-	ARM_MODE constexpr fixed(T _f)
+	constexpr fixed(T _f)
 		// 0.5 offset accounts for truncating to integer, round instead
 		: x((_f * 64) + 0.5f) {}

@ -84,7 +88,7 @@ public:
 	 *
 	 * Should not be used unless absolutely needed.
 	 */
-	ARM_MODE static constexpr fixed from_raw(int32_t x) {
+	static constexpr fixed from_raw(int32_t x) {
 		return fixed(x, true);
 	}

@ -94,7 +98,7 @@ public:
 	 * Gets the raw value of the fixed point number. i.e. The fixed point
 	 * number multiplied by 64.
 	 */
-	ARM_MODE constexpr int32_t raw() const {
+	constexpr int32_t raw() const {
 		return x;
 	}

@ -104,13 +108,13 @@ public:
 	 * Addition with fixed point numbers is the same as with a 32-bit
 	 * integer, so should be extremely quick.
 	 */
-	ARM_MODE constexpr fixed operator+(fixed rhs) const {
+	constexpr fixed operator+(fixed rhs) const {
 		return from_raw(x + rhs.x);
 	}
 	/**
 	 * \brief Fixed point subtraction
 	 */
-	ARM_MODE constexpr fixed operator-(fixed rhs) const {
+	constexpr fixed operator-(fixed rhs) const {
 		return from_raw(x - rhs.x);
 	}

@ -118,13 +122,13 @@ public:
 	 * \brief Fixed point multiplication
 	 *
 	 * Uses an assembly implementation to multiply the two numbers.
+	 *
+	 * \par ARM
+	 *
+	 * Use in ARM-mode only. Attempted use in Thumb-mode will cause a
+	 * compilation failure.
 	 */
-#ifdef __ARM_32BIT_STATE        // Safe to inline in ARM mode, but not in Thumb mode
-	ALWAYS_INLINE           // because ARM-mode instructions are used. GCC isn't smart
-#else                           // enough to figure it out on its own
-	NOINLINE
-#endif
-	ARM_MODE fixed operator*(fixed rhs) const {
+	fixed operator*(fixed rhs) const {
 		int32_t raw_result;
 		asm(
 				"smull	r8, r9, %[a], %[b];"
@ -149,59 +153,15 @@ public:
 	 * that if a denominator slowly approaches zero, once it reaches zero
 	 * the quotient's sign will flip. The largest value is used because fixed
 	 * point numbers don't have a representation of infinity.
+	 *
+	 * \par GBA
+	 *
+	 * Placed in IWRAM
 	 */
-#ifdef __ARM_32BIT_STATE        // Safe to inline in ARM mode, but not in Thumb mode
-	ALWAYS_INLINE           // because ARM-mode instructions are used. GCC isn't smart
-#else                           // enough to figure it out on its own
-	NOINLINE
-#endif
-	ARM_MODE fixed operator/(fixed rhs) const {
-		int32_t raw_result;
-		asm(
-				// This division implementation has two methods it can use.
-				// The fastest uses a left shift followed by a single division. The value is shifted
-				// first to preserve the decimal part. Unfortunately, this means large numerators
-				// will cause the operation to overflow. In this case, a compatible method will be
-				// used. This method uses two divisions, one to calculate the integral quotient,
-				// and one to calculate the decimal part. Both these methods work for negative numbers as well.
-				"movs	r1, %[d];"            // Load numerator and denominator, and check if negative or zero
-				"beq	4f;"
-				"movs	r0, %[n];"
-				"blt	1f;"
-				"tst	r0, #0x7e000000;"     // Check if the numerator is large enough to overflow
-				"bne	3f;"
-				"b	2f;"
-				"1:"	// check_negative
-				"mvn	r2, r0;"              // Check if the numerator is large enough to overflow.
-				"tst	r2, #0x7e000000;"
-				"bne	3f;"
-				"2:"	// fast_div           // Fast method
-				"lsl	r0, #6;"              // Shift first to avoid truncation
-				"swi	#0x60000;"            // GBA Div syscall
-				"mov	%[res], r0;"
-				"b	5f;"
-				"3:"	// compat_div         // Compatible method
-				"swi	#0x60000;"            // Compute quotient and shift
-				"lsl	r2, r0, #6;" 
-				"mov	r0, r1;"              // Div syscall puts the modulus in r1, use it as the numerator
-				"lsr	r1, %[d], #6;"        // Load the denominator again, shifted right to calculate decimal part
-				"swi	#0x60000;"
-				"mov	%[res], r2;"          // Calculate the final result
-				"add	%[res], r0;"
-				"b	5f;"
-				"4:"	// zero_div
-				"teq	%[n], %[d];"          // Set result to largest possible negative/positive value.
-				"movmi	%[res], #0x80000000;"
-				"movpl	%[res], #0x7FFFFFFF;"
-				"5:"
-				: [res] "=r" (raw_result)
-				: [n] "r" (x),
-				  [d] "r" (rhs.x)
-				:  "r0", "r1", "r2", "r3"
-		   );
-
-		return from_raw(raw_result);
-	}
+	fixed operator/(fixed rhs) const;
 };

 } // namespace mtl
+
+TARGET_END_MODE
+
--- a/src/gba/fixed.cpp
+++ b/src/gba/fixed.cpp
@ -0,0 +1,59 @@
+#include "mtl/target.hpp"
+
+#include "mtl/fixed.hpp"
+
+TARGET_ARM_MODE
+
+namespace mtl {
+
+/**
+ * \brief Fixed point division (GBA implementation)
+ *
+ * Divides the raw value of this number by the raw value of \p rhs using the
+ * GBA Div syscall and returns the quotient as a fixed point number.
+ *
+ * A zero denominator does not reach the syscall: the raw result is set to
+ * 0x80000000 when the numerator is negative and 0x7FFFFFFF otherwise.
+ *
+ * \param rhs Denominator
+ * \return The quotient, constructed from the raw result of the assembly
+ */
+GBA_IWRAM fixed fixed::operator/(fixed rhs) const {
+	int32_t raw_result;
+	asm(
+			// This division implementation has two methods it can use.
+			// The fastest uses a left shift followed by a single division. The value is shifted
+			// first to preserve the decimal part. Unfortunately, this means large numerators
+			// will cause the operation to overflow. In this case, a compatible method will be
+			// used. This method uses two divisions, one to calculate the integral quotient,
+			// and one to calculate the decimal part. Both these methods work for negative numbers as well.
+			"movs	r1, %[d];"            // Load the denominator and check if it is zero
+			"beq	4f;"
+			"movs	r0, %[n];"            // Load the numerator and check if it is negative
+			"blt	1f;"
+			"tst	r0, #0x7e000000;"     // Check if the numerator is large enough to overflow
+			"bne	3f;"
+			"b	2f;"
+			"1:"	// check_negative
+			"mvn	r2, r0;"              // Check if the (inverted) numerator is large enough to overflow.
+			"tst	r2, #0x7e000000;"
+			"bne	3f;"
+			"2:"	// fast_div           // Fast method
+			"lsl	r0, #6;"              // Shift first to avoid truncation
+			"swi	#0x60000;"            // GBA Div syscall
+			"mov	%[res], r0;"
+			"b	5f;"
+			"3:"	// compat_div         // Compatible method
+			"swi	#0x60000;"            // Compute quotient and shift
+			"lsl	r2, r0, #6;"
+			"mov	r0, r1;"              // Div syscall puts the modulus in r1, use it as the numerator
+			// NOTE(review): lsr is a logical shift; for a negative denominator an
+			// arithmetic shift (asr) may be intended here, and a denominator with
+			// magnitude below 64 shifts to zero - confirm both cases.
+			"lsr	r1, %[d], #6;"        // Load the denominator again, shifted right to calculate decimal part
+			"swi	#0x60000;"
+			"mov	%[res], r2;"          // Calculate the final result
+			"add	%[res], r0;"
+			"b	5f;"
+			"4:"	// zero_div
+			"teq	%[n], %[d];"          // Set result to largest possible negative/positive value.
+			"movmi	%[res], #0x80000000;"
+			"movpl	%[res], #0x7FFFFFFF;"
+			"5:"
+			: [res] "=r" (raw_result)
+			: [n] "r" (x),
+			  [d] "r" (rhs.x)
+			// r0-r3 are used as scratch and by the syscall; "cc" is required
+			// because movs/tst/teq modify the condition flags.
+			: "r0", "r1", "r2", "r3", "cc"
+	);
+
+	// raw_result is already in raw (scaled) form. Returning the bare int32_t
+	// would select the implicit integral constructor, which multiplies by 64
+	// again, so the value must be wrapped with from_raw().
+	return from_raw(raw_result);
+}
+
+} // namespace mtl
+
+TARGET_END_MODE
+