Teensy 4.1 core updated for C++20

memcpy-armv7m.S 7.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
  28. #if defined(__MK20DX128__) || defined(__MK20DX256__) || defined(__MK66FX1M0__)
  29. #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
  30. #define __ARM_FEATURE_UNALIGNED 1
  31. /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
  32. unaligned access.
  33. If compiled with GCC, this file should be enclosed within following
  34. pre-processing check:
  35. if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
  36. Prototype: void *memcpy (void *dst, const void *src, size_t count);
  37. The job will be done in 5 steps.
  38. Step 1: Align src/dest pointers, copy mis-aligned if fail to align both
  39. Step 2: Repeatedly copy big block size of __OPT_BIG_BLOCK_SIZE
  40. Step 3: Repeatedly copy big block size of __OPT_MID_BLOCK_SIZE
  41. Step 4: Copy word by word
  42. Step 5: Copy byte-to-byte
  43. Tunable options:
  44. __OPT_BIG_BLOCK_SIZE: Size of big block in words. Default to 64.
  45. __OPT_MID_BLOCK_SIZE: Size of big block in words. Default to 16.
  46. */
  47. #ifndef __OPT_BIG_BLOCK_SIZE
  48. #define __OPT_BIG_BLOCK_SIZE (4 * 16)
  49. #endif
  50. #ifndef __OPT_MID_BLOCK_SIZE
  51. #define __OPT_MID_BLOCK_SIZE (4 * 4)
  52. #endif
  53. #if __OPT_BIG_BLOCK_SIZE == 16
  54. #define BEGIN_UNROLL_BIG_BLOCK \
  55. .irp offset, 0,4,8,12
  56. #elif __OPT_BIG_BLOCK_SIZE == 32
  57. #define BEGIN_UNROLL_BIG_BLOCK \
  58. .irp offset, 0,4,8,12,16,20,24,28
  59. #elif __OPT_BIG_BLOCK_SIZE == 64
  60. #define BEGIN_UNROLL_BIG_BLOCK \
  61. .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
  62. #else
  63. #error "Illegal __OPT_BIG_BLOCK_SIZE"
  64. #endif
  65. #if __OPT_MID_BLOCK_SIZE == 8
  66. #define BEGIN_UNROLL_MID_BLOCK \
  67. .irp offset, 0,4
  68. #elif __OPT_MID_BLOCK_SIZE == 16
  69. #define BEGIN_UNROLL_MID_BLOCK \
  70. .irp offset, 0,4,8,12
  71. #else
  72. #error "Illegal __OPT_MID_BLOCK_SIZE"
  73. #endif
  74. #define END_UNROLL .endr
  75. .syntax unified
  76. .text
  77. .align 2
  78. .global memcpy
  79. .thumb
  80. .thumb_func
  81. .type memcpy, %function
  82. memcpy:
  83. @ r0: dst
  84. @ r1: src
  85. @ r2: len
  86. #ifdef __ARM_FEATURE_UNALIGNED
  87. /* In case of UNALIGNED access supported, ip is not used in
  88. function body. */
  89. mov ip, r0
  90. #else
  91. push {r0}
  92. #endif
  93. orr r3, r1, r0
  94. ands r3, r3, #3
  95. bne .Lmisaligned_copy
  96. .Lbig_block:
  97. subs r2, __OPT_BIG_BLOCK_SIZE
  98. blo .Lmid_block
  99. /* Kernel loop for big block copy */
  100. .align 2
  101. .Lbig_block_loop:
  102. BEGIN_UNROLL_BIG_BLOCK
  103. #ifdef __ARM_ARCH_7EM__
  104. ldr r3, [r1], #4
  105. str r3, [r0], #4
  106. END_UNROLL
  107. #else /* __ARM_ARCH_7M__ */
  108. ldr r3, [r1, \offset]
  109. str r3, [r0, \offset]
  110. END_UNROLL
  111. adds r0, __OPT_BIG_BLOCK_SIZE
  112. adds r1, __OPT_BIG_BLOCK_SIZE
  113. #endif
  114. subs r2, __OPT_BIG_BLOCK_SIZE
  115. bhs .Lbig_block_loop
  116. .Lmid_block:
  117. adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
  118. blo .Lcopy_word_by_word
  119. /* Kernel loop for mid-block copy */
  120. .align 2
  121. .Lmid_block_loop:
  122. BEGIN_UNROLL_MID_BLOCK
  123. #ifdef __ARM_ARCH_7EM__
  124. ldr r3, [r1], #4
  125. str r3, [r0], #4
  126. END_UNROLL
  127. #else /* __ARM_ARCH_7M__ */
  128. ldr r3, [r1, \offset]
  129. str r3, [r0, \offset]
  130. END_UNROLL
  131. adds r0, __OPT_MID_BLOCK_SIZE
  132. adds r1, __OPT_MID_BLOCK_SIZE
  133. #endif
  134. subs r2, __OPT_MID_BLOCK_SIZE
  135. bhs .Lmid_block_loop
  136. .Lcopy_word_by_word:
  137. adds r2, __OPT_MID_BLOCK_SIZE - 4
  138. blo .Lcopy_less_than_4
  139. /* Kernel loop for small block copy */
  140. .align 2
  141. .Lcopy_word_by_word_loop:
  142. ldr r3, [r1], #4
  143. str r3, [r0], #4
  144. subs r2, #4
  145. bhs .Lcopy_word_by_word_loop
  146. .Lcopy_less_than_4:
  147. adds r2, #4
  148. beq .Ldone
  149. lsls r2, r2, #31
  150. itt ne
  151. ldrbne r3, [r1], #1
  152. strbne r3, [r0], #1
  153. bcc .Ldone
  154. #ifdef __ARM_FEATURE_UNALIGNED
  155. ldrh r3, [r1]
  156. strh r3, [r0]
  157. #else
  158. ldrb r3, [r1]
  159. strb r3, [r0]
  160. ldrb r3, [r1, #1]
  161. strb r3, [r0, #1]
  162. #endif /* __ARM_FEATURE_UNALIGNED */
  163. .Ldone:
  164. #ifdef __ARM_FEATURE_UNALIGNED
  165. mov r0, ip
  166. #else
  167. pop {r0}
  168. #endif
  169. bx lr
  170. .align 2
  171. .Lmisaligned_copy:
  172. #ifdef __ARM_FEATURE_UNALIGNED
  173. /* Define label DST_ALIGNED to BIG_BLOCK. It will go to aligned copy
  174. once destination is adjusted to aligned. */
  175. #define Ldst_aligned Lbig_block
  176. /* Copy word by word using LDR when alignment can be done in hardware,
  177. i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */
  178. cmp r2, #8
  179. blo .Lbyte_copy
  180. /* if src is aligned, just go to the big block loop. */
  181. lsls r3, r1, #30
  182. beq .Ldst_aligned
  183. #else
  184. /* if len < 12, misalignment adjustment has more overhead than
  185. just byte-to-byte copy. Also, len must >=8 to guarantee code
  186. afterward work correctly. */
  187. cmp r2, #12
  188. blo .Lbyte_copy
  189. #endif /* __ARM_FEATURE_UNALIGNED */
  190. /* Align dst only, not trying to align src. That is the because
  191. handling of aligned src and misaligned dst need more overhead than
  192. otherwise. By doing this the worst case is when initial src is aligned,
  193. additional up to 4 byte additional copy will executed, which is
  194. acceptable. */
  195. ands r3, r0, #3
  196. beq .Ldst_aligned
  197. rsb r3, #4
  198. subs r2, r3
  199. lsls r3, r3, #31
  200. itt ne
  201. ldrbne r3, [r1], #1
  202. strbne r3, [r0], #1
  203. bcc .Ldst_aligned
  204. #ifdef __ARM_FEATURE_UNALIGNED
  205. ldrh r3, [r1], #2
  206. strh r3, [r0], #2
  207. b .Ldst_aligned
  208. #else
  209. ldrb r3, [r1], #1
  210. strb r3, [r0], #1
  211. ldrb r3, [r1], #1
  212. strb r3, [r0], #1
  213. /* Now that dst is aligned */
  214. .Ldst_aligned:
  215. /* if r1 is aligned now, it means r0/r1 has the same misalignment,
  216. and they are both aligned now. Go aligned copy. */
  217. ands r3, r1, #3
  218. beq .Lbig_block
  219. /* dst is aligned, but src isn't. Misaligned copy. */
  220. push {r4, r5}
  221. subs r2, #4
  222. /* Backward r1 by misaligned bytes, to make r1 aligned.
  223. Since we need to restore r1 to unaligned address after the loop,
  224. we need keep the offset bytes to ip and sub it from r1 afterward. */
  225. subs r1, r3
  226. rsb ip, r3, #4
  227. /* Pre-load on word */
  228. ldr r4, [r1], #4
  229. cmp r3, #2
  230. beq .Lmisaligned_copy_2_2
  231. cmp r3, #3
  232. beq .Lmisaligned_copy_3_1
  233. .macro mis_src_copy shift
  234. 1:
  235. lsrs r4, r4, \shift
  236. ldr r3, [r1], #4
  237. lsls r5, r3, 32-\shift
  238. orr r4, r4, r5
  239. str r4, [r0], #4
  240. mov r4, r3
  241. subs r2, #4
  242. bhs 1b
  243. .endm
  244. .Lmisaligned_copy_1_3:
  245. mis_src_copy shift=8
  246. b .Lsrc_misaligned_tail
  247. .Lmisaligned_copy_3_1:
  248. mis_src_copy shift=24
  249. b .Lsrc_misaligned_tail
  250. .Lmisaligned_copy_2_2:
  251. /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */
  252. mis_src_copy shift=16
  253. .Lsrc_misaligned_tail:
  254. adds r2, #4
  255. subs r1, ip
  256. pop {r4, r5}
  257. #endif /* __ARM_FEATURE_UNALIGNED */
  258. .Lbyte_copy:
  259. subs r2, #4
  260. blo .Lcopy_less_than_4
  261. .Lbyte_copy_loop:
  262. subs r2, #1
  263. ldrb r3, [r1], #1
  264. strb r3, [r0], #1
  265. bhs .Lbyte_copy_loop
  266. ldrb r3, [r1]
  267. strb r3, [r0]
  268. ldrb r3, [r1, #1]
  269. strb r3, [r0, #1]
  270. ldrb r3, [r1, #2]
  271. strb r3, [r0, #2]
  272. #ifdef __ARM_FEATURE_UNALIGNED
  273. mov r0, ip
  274. #else
  275. pop {r0}
  276. #endif
  277. bx lr
  278. .size memcpy, .-memcpy
  279. #endif
  280. #endif // __MK20DX128__ || __MK20DX256__ || __MK66FX1M0__