Shifting both sides of the division (the CPU frequency and the 1e9 divisor) right by 16 bits gives pretty good accuracy with fast code:
        movw    r3, #9155
        ldr     r2, .L2
        mul     r0, r3, r0
        lsrs    r0, r0, #1
        umull   r3, r0, r2, r0
        lsrs    r0, r0, #8
.L2:
        .word   144122641
At 984 MHz this allows nsec values up to ~143000, which is plenty.
			
			teensy4-core
		| @@ -758,7 +758,7 @@ static inline void delayNanoseconds(uint32_t) __attribute__((always_inline, unus | |||
| static inline void delayNanoseconds(uint32_t nsec) | |||
| { | |||
| uint32_t begin = ARM_DWT_CYCCNT; | |||
| uint32_t cycles = ((uint64_t)F_CPU_ACTUAL * nsec) / 1000000000UL; | |||
| uint32_t cycles = ((F_CPU_ACTUAL>>16) * nsec) / (1000000000UL>>16); | |||
| while (ARM_DWT_CYCCNT - begin < cycles) ; // wait | |||
| } | |||