diff --git a/include/mtl/fixed.hpp b/include/mtl/fixed.hpp
index 92efa59..f4b708d 100644
--- a/include/mtl/fixed.hpp
+++ b/include/mtl/fixed.hpp
@@ -1,13 +1,9 @@
 #pragma once
 
 #include <cstdint>
+#include <type_traits>
 
-/**
- * \brief Fixed point multiply assembly implementation
- *
- * DO NOT USE DIRECTLY! Use fixed::operator* instead
- */
-extern "C" int32_t mtl_fixed_mul(int32_t x, int32_t y);
+#include "mtl/target.hpp"
 
 namespace mtl {
 /**
@@ -20,63 +16,189 @@ namespace mtl {
  *
  * Has a maximum error of +/- 1/128 (~0.0078), integers are always
  * exactly.
+ *
+ * \par ARM
+ *
+ * All member functions are compiled in ARM mode because some operators (notably
+ * multiplication and division) use ARM-only instructions. For optimal performance,
+ * fixed point numbers should be used in ARM-mode code to enable inlining. To ensure
+ * inlining is enabled, enclose the include directive in `TARGET_ARM_MODE` and
+ * `TARGET_END_MODE` from `<mtl/target.hpp>`. This is necessary because inline assembly
+ * is used and GCC can't tell that ARM-only instructions are used, so it tries
+ * to inline in Thumb mode too. If these directives are not used, some operations
+ * will not be inlined even in ARM mode (e.g. multiplication and division).
  */
 class fixed {
+private:
     int32_t x;
 
     /**
      * \brief Raw constructor
      *
      * Creates a new fixed point number with the raw data of x.
+     *
+     * \note
+     *
+     * DO NOT USE DIRECTLY. Use `from_raw` instead.
+     *
+     * \note
+     *
      * DO NOT use to set the fixed number to an integer value, use
      * the public constructor instead.
      */
-    constexpr fixed(int32_t _x, bool) : x(_x) {}
+    ARM_MODE constexpr fixed(int32_t _x, bool) : x(_x) {}
 public:
-    constexpr fixed() : x(0) {}
+    ARM_MODE constexpr fixed() : x(0) {}
 
     /**
-     * \brief 32-bit integer constructor
+     * \brief Integer constructor
      *
      * Creates a new fixed point number with the value of the integer.
      * Must be within the range represented by fixed point numbers, see
      * the class description for more detail.
      */
-    constexpr fixed(int32_t _i) : x(_i * 64) {}
+    template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
+    ARM_MODE constexpr fixed(T _i) : x(_i * 64) {}
 
     /**
      * \brief Floating point constructor
      *
      * Creates a new fixed point number with the closest number to
      * the floating point number. Must be within the range represented by
      * fixed point numbers, see the class description for more detail.
+     *
+     * Must be implemented as a template with enable_if, otherwise passing
+     * an int (not int32_t) is ambiguous between the promotion to int32_t and
+     * float.
      */
-    constexpr fixed(float _f)
-        // 0.5 offset accounts for truncating to integer, round instead
-        : x((_f * 64) + 0.5f) {}
+    template <typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
+    ARM_MODE constexpr fixed(T _f)
+        // The cast truncates toward zero, so offset by 0.5 away from zero to round to nearest
+        : x(static_cast<int32_t>(_f * 64 + (_f < 0 ? -0.5f : 0.5f))) {}
 
+    /**
+     * \brief Raw value factory
+     *
+     * Creates a new fixed point number with the raw data of x.
+     *
+     * \note
+     *
+     * Should not be used unless absolutely needed.
+     */
+    ARM_MODE static constexpr fixed from_raw(int32_t x) {
+        return fixed(x, true);
+    }
+
+    /**
+     * \brief Raw value accessor
+     *
+     * Gets the raw value of the fixed point number, i.e. the fixed point
+     * number multiplied by 64.
+     */
+    ARM_MODE constexpr int32_t raw() const {
+        return x;
+    }
+
     /**
      * \brief Fixed point addition
      *
      * Addition with fixed point numbers is the same as with a 32-bit
      * integer, so should be extremely quick.
      */
-    fixed operator+(fixed rhs) const {
-        return fixed(x + rhs.x, true);
+    ARM_MODE constexpr fixed operator+(fixed rhs) const {
+        return from_raw(x + rhs.x);
     }
 
     /**
      * \brief Fixed point multiplication
      *
      * Uses an assembly implementation to multiply the two numbers.
-     * Not as quick as an integer multiplication. Use sparringly.
-     *
-     * Tested on the MGBA Gameboy Advance emulator, takes around 70
-     * cycles when the assembly routine is placed in IWRAM.
-     * The Gameboy Advance uses an armv7tdmi, and IWRAM is the fastest
-     * available RAM.
      */
-    fixed operator*(fixed rhs) const {
-        return fixed(mtl_fixed_mul(x, rhs.x), true);
+#ifdef __ARM_32BIT_STATE // Safe to inline in ARM mode, but not in Thumb mode
+    ALWAYS_INLINE        // because ARM-mode instructions are used. GCC isn't smart
+#else                    // enough to figure it out on its own
+    NOINLINE
+#endif
+    ARM_MODE fixed operator*(fixed rhs) const {
+        int32_t raw_result;
+        asm(
+            "smull r8, r9, %[a], %[b];"         // r9:r8 = 64-bit product of the raw values
+            "lsr %[res], r8, #6;"               // Drop the 6 extra fractional bits of the low word
+            "orr %[res], r9, lsl #26;"          // Bring in the low 6 bits of the high word
+            : [res] "=r" (raw_result)
+            : [a] "r" (x),
+              [b] "r" (rhs.x)
+            : "r8", "r9"
+        );
+
+        return from_raw(raw_result);
+    }
+
+    /**
+     * \brief Fixed point division
+     *
+     * Faster for numerators in domain [-0x7FFFF, 0x7FFFF].
+     *
+     * On attempted division by zero, the result is set to the largest
+     * absolute value possible with the same sign as the numerator. This means
+     * that if a denominator slowly approaches zero, once it reaches zero
+     * the quotient's sign will flip. The largest value is used because fixed
+     * point numbers don't have a representation of infinity.
+     */
+#ifdef __ARM_32BIT_STATE // Safe to inline in ARM mode, but not in Thumb mode
+    ALWAYS_INLINE        // because ARM-mode instructions are used. GCC isn't smart
+#else                    // enough to figure it out on its own
+    NOINLINE
+#endif
+    ARM_MODE fixed operator/(fixed rhs) const {
+        int32_t raw_result;
+        asm(
+            // This division implementation has two methods it can use.
+            // The fastest uses a left shift followed by a single division. The value is shifted
+            // first to preserve the decimal part. Unfortunately, this means large numerators
+            // will cause the operation to overflow. In this case, a compatible method will be
+            // used. This method uses two divisions, one to calculate the integral quotient,
+            // and one to calculate the decimal part. Both these methods work for negative numbers as well.
+            "movs r1, %[d];"                    // Load numerator and denominator, and check if negative or zero
+            "beq 4f;"
+            "movs r0, %[n];"
+            "bmi 1f;"                           // Test N only: movs does not update V, so blt would read a stale flag
+            "tst r0, #0x7e000000;"              // Check if the numerator is large enough to overflow
+            "bne 3f;"
+            "b 2f;"
+            "1:" // check_negative
+            "mvn r2, r0;"                       // Check if the numerator is large enough to overflow.
+            "tst r2, #0x7e000000;"
+            "bne 3f;"
+            "2:" // fast_div                    // Fast method
+            "lsl r0, #6;"                       // Shift first to avoid truncation
+            "swi #0x60000;"                     // GBA Div syscall
+            "mov %[res], r0;"
+            "b 5f;"
+            "3:" // compat_div                  // Compatible method
+            "swi #0x60000;"                     // Compute quotient and shift
+            "lsl r2, r0, #6;"
+            "mov r0, r1;"                       // Div syscall puts the modulus in r1, use it as the numerator
+            "mov r1, %[d];"                     // Load the denominator again
+            "eor r3, r1, r1, asr #31;"          // r3 = d when d >= 0, ~d when d < 0
+            "tst r3, #0x7e000000;"              // Is the denominator small enough that remainder << 6 cannot overflow?
+            "lsleq r0, r0, #6;"                 // Yes: scale the remainder instead, since d >> 6 could be zero
+            "asrne r1, r1, #6;"                 // No: scale the denominator, arithmetic shift keeps its sign
+            "swi #0x60000;"
+            "mov %[res], r2;"                   // Calculate the final result
+            "add %[res], r0;"
+            "b 5f;"
+            "4:" // zero_div
+            "teq %[n], %[d];"                   // Set result to largest possible negative/positive value.
+            "movmi %[res], #0x80000000;"
+            "movpl %[res], #0x7FFFFFFF;"
+            "5:"
+            : [res] "=r" (raw_result)
+            : [n] "r" (x),
+              [d] "r" (rhs.x)
+            : "r0", "r1", "r2", "r3", "cc"      // "cc": movs/tst/teq modify the condition flags
+        );
+
+        return from_raw(raw_result);
     }
 };
diff --git a/src/armv4t/fixed.s b/src/armv4t/fixed.s
deleted file mode 100644
index 88c2682..0000000
--- a/src/armv4t/fixed.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.section .iwram, "ax", %progbits
-.arm
-.align 2
-
-.global mtl_fixed_mul
-.type mtl_fixed_mul STT_FUNC
-mtl_fixed_mul:
-smull r2, r3, r0, r1
-lsr r0, r2, #6
-orr r0, r3, lsl #26
-bx lr