diff --git a/include/mtl/fixed.hpp b/include/mtl/fixed.hpp
index 92efa59..f4b708d 100644
--- a/include/mtl/fixed.hpp
+++ b/include/mtl/fixed.hpp
@@ -1,13 +1,9 @@
 #pragma once
 
 #include <cstdint>
+#include <type_traits>
 
-/**
- * \brief Fixed point multiply assembly implementation
- *
- * DO NOT USE DIRECTLY! Use fixed::operator* instead
- */
-extern "C" int32_t mtl_fixed_mul(int32_t x, int32_t y);
+#include "mtl/target.hpp"
 
 namespace mtl {
 /**
@@ -20,63 +16,185 @@ namespace mtl {
  *
  * Has a maximum error of +/- 1/128 (~0.0078), integers are always
  * exactly.
+ *
+ * \par ARM
+ *
+ * All member functions are compiled in ARM mode because some operators (notably
+ * multiplication and division) use ARM-only instructions. For optimal performance,
+ * fixed point numbers should be used in ARM-mode code to enable inlining. To ensure
+ * inlining is enabled, enclose the include directive in `TARGET_ARM_MODE` and
+ * `TARGET_END_MODE` from `<mtl/target.hpp>`. This is necessary because inline assembly
+ * is used and GCC can't tell that ARM-only instructions are used, so it tries
+ * to inline in Thumb mode too. If these directives are not used, some operations
+ * (e.g. multiplication and division) will not be inlined, even in ARM mode.
  */
 class fixed {
+private:
 	int32_t x;
 
 	/**
 	 * \brief Raw constructor
 	 *
 	 * Creates a new fixed point number with the raw data of x.
+	 *
+	 * \note
+	 *
+	 * DO NOT USE DIRECTLY. Use `from_raw` instead.
+	 *
+	 * \note
+	 *
 	 * DO NOT use to set the fixed number to an integer value, use
 	 * the public constructor instead.
 	 */
-	constexpr fixed(int32_t _x, bool) : x(_x) {}
+	ARM_MODE constexpr fixed(int32_t _x, bool) : x(_x) {}
 
 public:
-	constexpr fixed() : x(0) {}
+	ARM_MODE constexpr fixed() : x(0) {}
 	/**
-	 * \brief 32-bit integer constructor
+	 * \brief Integer constructor
 	 *
 	 * Creates a new fixed point number with the value of the integer.
 	 * Must be within the range represented by fixed point numbers, see
 	 * the class description for more detail.
 	 */
-	constexpr fixed(int32_t _i) : x(_i * 64) {}
+	template <typename T, std::enable_if_t<std::is_integral_v<T>, bool> = true>
+	ARM_MODE constexpr fixed(T _i) : x(_i * 64) {}
 	/**
 	 * \brief Floating point constructor
 	 *
 	 * Creates a new fixed point number with the closest number to
 	 * the floating point number. Must be within the range represented by
 	 * fixed point numbers, see the class description for more detail.
+	 *
+	 * Must be implemented as a template with enable_if, otherwise passing
+	 * an int (not int32_t) is ambiguous between the conversions to int32_t
+	 * and to float.
 	 */
-	constexpr fixed(float _f)
+	template <typename T, std::enable_if_t<std::is_floating_point_v<T>, bool> = true>
+	ARM_MODE constexpr fixed(T _f)
 		// 0.5 offset accounts for truncating to integer, round instead
 		: x((_f * 64) + 0.5f) {}
-	
+
+	/**
+	 * \brief Construct from a raw value
+	 *
+	 * Stores x unchanged as the internal representation; no scaling by 64
+	 * is applied.
+	 *
+	 * \note
+	 * Avoid unless absolutely needed.
+	 */
+	ARM_MODE static constexpr fixed from_raw(int32_t x) {
+		return fixed{x, true};
+	}
+
+	/**
+	 * \brief Raw value accessor
+	 *
+	 * Returns the stored 32-bit representation, i.e. the fixed point
+	 * number multiplied by 64.
+	 */
+	ARM_MODE constexpr int32_t raw() const {
+		return this->x;
+	}
+
 	/**
 	 * \brief Fixed point addition
 	 *
 	 * Addition with fixed point numbers is the same as with a 32-bit
 	 * integer, so should be extremely quick.
 	 */
-	fixed operator+(fixed rhs) const {
-		return fixed(x + rhs.x, true);
+	ARM_MODE constexpr fixed operator+(fixed rhs) const {
+		return from_raw(x + rhs.x); // Both operands share the same scale, so the raw values add directly
 	}
 
 	/**
 	 * \brief Fixed point multiplication
 	 *
 	 * Uses an assembly implementation to multiply the two numbers.
-	 * Not as quick as an integer multiplication. Use sparringly.
-	 *
-	 * Tested on the MGBA Gameboy Advance emulator, takes around 70
-	 * cycles when the assembly routine is placed in IWRAM.
-	 * The Gameboy Advance uses an armv7tdmi, and IWRAM is the fastest
-	 * available RAM.
 	 */
-	fixed operator*(fixed rhs) const {
-		return fixed(mtl_fixed_mul(x, rhs.x), true);
+#ifdef __ARM_32BIT_STATE        // Safe to inline in ARM mode, but not in Thumb mode
+	ALWAYS_INLINE           // because ARM-mode instructions are used. GCC isn't smart
+#else                           // enough to figure it out on its own
+	NOINLINE
+#endif
+	ARM_MODE fixed operator*(fixed rhs) const {
+		int32_t raw_result;
+		asm(
+				"smull	r8, r9, %[a], %[b];"      // r9:r8 = signed 64-bit product of the two raw values
+				"lsr	%[res], r8, #6;"          // Low word shifted right by the 6 fractional bits
+				"orr	%[res], r9, lsl #26;"     // Merge in the bits shifted down from the high word
+				: [res] "=r" (raw_result)
+				: [a] "r" (x),
+				  [b] "r" (rhs.x)
+				: "r8", "r9"                      // Scratch registers holding the 64-bit product
+		   );
+		
+		return from_raw(raw_result);
+	}
+
+	/**
+	 * \brief Fixed point division
+	 *
+	 * Faster for numerators in domain [-0x7FFFF, 0x7FFFF].
+	 *
+	 * On attempted division by zero, the result is set to the largest
+	 * absolute value possible with the same sign as the numerator. This means
+	 * that if a negative denominator slowly approaches zero, once it reaches
+	 * zero the quotient's sign will flip. The largest value is used because
+	 * fixed point numbers don't have a representation of infinity.
+	 */
+#ifdef __ARM_32BIT_STATE        // Safe to inline in ARM mode, but not in Thumb mode
+	ALWAYS_INLINE           // because ARM-mode instructions are used. GCC isn't smart
+#else                           // enough to figure it out on its own
+	NOINLINE
+#endif
+	ARM_MODE fixed operator/(fixed rhs) const {
+		int32_t raw_result;
+		asm(
+				// This division implementation has two methods it can use.
+				// The fastest uses a left shift followed by a single division. The value is shifted
+				// first to preserve the decimal part. Unfortunately, this means large numerators
+				// will cause the operation to overflow. In this case, a compatible method will be
+				// used. This method uses two divisions, one to calculate the integral quotient,
+				// and one to calculate the decimal part. Both these methods work for negative numbers as well.
+				"movs	r1, %[d];"            // Load numerator and denominator, and check if negative or zero
+				"beq	4f;"
+				"movs	r0, %[n];"
+				"blt	1f;"
+				"tst	r0, #0x7e000000;"     // Check if the numerator is large enough to overflow
+				"bne	3f;"
+				"b	2f;"
+				"1:"	// check_negative
+				"mvn	r2, r0;"              // Check if the numerator is large enough to overflow.
+				"tst	r2, #0x7e000000;"
+				"bne	3f;"
+				"2:"	// fast_div           // Fast method
+				"lsl	r0, #6;"              // Shift first to avoid truncation
+				"swi	#0x60000;"            // GBA Div syscall
+				"mov	%[res], r0;"
+				"b	5f;"
+				"3:"	// compat_div         // Compatible method
+				"swi	#0x60000;"            // Compute quotient and shift
+				"lsl	r2, r0, #6;"
+				"mov	r0, r1;"              // Div syscall puts the modulus in r1, use it as the numerator
+				"asr	r1, %[d], #6;"        // Load the denominator again; arithmetic shift keeps its sign for the decimal part
+				"swi	#0x60000;"            // NOTE(review): r1 is 0 here when 0 < d < 64 - confirm BIOS Div behaviour for that case
+				"mov	%[res], r2;"          // Calculate the final result
+				"add	%[res], r0;"
+				"b	5f;"
+				"4:"	// zero_div
+				"teq	%[n], %[d];"          // d is zero here, so the N flag is the sign of the numerator
+				"movmi	%[res], #0x80000000;" // Negative numerator: largest negative value
+				"movpl	%[res], #0x7FFFFFFF;" // Otherwise: largest positive value
+				"5:"
+				: [res] "=r" (raw_result)
+				: [n] "r" (x),
+				  [d] "r" (rhs.x)
+				:  "r0", "r1", "r2", "r3", "cc" // movs/tst/teq modify the condition flags
+		   );
+
+		return from_raw(raw_result);
+	}
 };
 
diff --git a/src/armv4t/fixed.s b/src/armv4t/fixed.s
deleted file mode 100644
index 88c2682..0000000
--- a/src/armv4t/fixed.s
+++ /dev/null
@@ -1,11 +0,0 @@
-.section .iwram, "ax", %progbits
-.arm
-.align 2
-
-.global mtl_fixed_mul
-.type mtl_fixed_mul STT_FUNC
-mtl_fixed_mul:
-smull	r2, r3, r0, r1
-lsr	r0, r2, #6
-orr	r0, r3, lsl #26
-bx	lr