Teensy 4.1 core updated for C++20

memcpy-armv7m.S 8.0KB

il y a 9 ans
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. /*
  2. * Copyright (c) 2013 ARM Ltd
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 3. The name of the company may not be used to endorse or promote
  14. * products derived from this software without specific prior written
  15. * permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19. * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20. * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  22. * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  23. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24. * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25. * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  26. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. */
  28. #if defined(__MK20DX128__) || defined(__MK20DX256__) || defined(__MK64FX512__) || defined(__MK66FX1M0__)
  29. #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
  30. /*
  31. * Let __ARM_FEATURE_UNALIGNED be set by the architecture and the compiler flags:
  32. * -munaligned-access
  33. * -mno-unaligned-access
  34. * instead of always setting it here.
  35. *
  36. #define __ARM_FEATURE_UNALIGNED 1
  37. */
  38. /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
  39. unaligned access.
  40. If compiled with GCC, this file should be enclosed within following
  41. pre-processing check:
  42. if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
  43. Prototype: void *memcpy (void *dst, const void *src, size_t count);
  44. The job will be done in 5 steps.
  45. Step 1: Align src/dest pointers, copy mis-aligned if fail to align both
  46. Step 2: Repeatedly copy big block size of __OPT_BIG_BLOCK_SIZE
  47. Step 3: Repeatedly copy mid block size of __OPT_MID_BLOCK_SIZE
  48. Step 4: Copy word by word
  49. Step 5: Copy byte-to-byte
  50. Tunable options:
  51. __OPT_BIG_BLOCK_SIZE: Size of big block in bytes. Default to 64.
  52. __OPT_MID_BLOCK_SIZE: Size of mid block in bytes. Default to 16.
  53. */
  54. #ifndef __OPT_BIG_BLOCK_SIZE /* Tunable: bytes copied per big-block loop iteration; a value predefined by the build is kept. */
  55. #define __OPT_BIG_BLOCK_SIZE (4 * 16) /* default: 16 words = 64 bytes */
  56. #endif
  57. #ifndef __OPT_MID_BLOCK_SIZE /* Tunable: bytes copied per mid-block loop iteration; a value predefined by the build is kept. */
  58. #define __OPT_MID_BLOCK_SIZE (4 * 4) /* default: 4 words = 16 bytes */
  59. #endif
  60. #if __OPT_BIG_BLOCK_SIZE == 16 /* BEGIN_UNROLL_BIG_BLOCK opens an .irp with one byte offset per word of the block */
  61. #define BEGIN_UNROLL_BIG_BLOCK \
  62. .irp offset, 0,4,8,12
  63. #elif __OPT_BIG_BLOCK_SIZE == 32 /* 8 words per iteration */
  64. #define BEGIN_UNROLL_BIG_BLOCK \
  65. .irp offset, 0,4,8,12,16,20,24,28
  66. #elif __OPT_BIG_BLOCK_SIZE == 64 /* 16 words per iteration */
  67. #define BEGIN_UNROLL_BIG_BLOCK \
  68. .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
  69. #else /* only 16, 32 and 64 bytes have an unroll list */
  70. #error "Illegal __OPT_BIG_BLOCK_SIZE"
  71. #endif
  72. #if __OPT_MID_BLOCK_SIZE == 8 /* BEGIN_UNROLL_MID_BLOCK: same scheme for the mid block, 2 words per iteration */
  73. #define BEGIN_UNROLL_MID_BLOCK \
  74. .irp offset, 0,4
  75. #elif __OPT_MID_BLOCK_SIZE == 16 /* 4 words per iteration */
  76. #define BEGIN_UNROLL_MID_BLOCK \
  77. .irp offset, 0,4,8,12
  78. #else /* only 8 and 16 bytes have an unroll list */
  79. #error "Illegal __OPT_MID_BLOCK_SIZE"
  80. #endif
  81. #define END_UNROLL .endr /* closes the .irp opened by either BEGIN_UNROLL_* macro */
  82. .syntax unified /* void *memcpy(void *dst, const void *src, size_t count): entry point and the copy path used when dst and src are both 4-byte aligned. */
  83. .text
  84. .align 2
  85. .global memcpy
  86. .thumb
  87. .thumb_func
  88. .type memcpy, %function
  89. memcpy:
  90. @ r0: dst (advanced while copying, the original value is restored before returning)
  91. @ r1: src (advanced while copying)
  92. @ r2: len (bytes left, held biased by the unit size of the loop in progress)
  93. #ifdef __ARM_FEATURE_UNALIGNED
  94. /* In case of UNALIGNED access supported, ip is not used in
  95. function body. */
  96. mov ip, r0 /* keep the original dst in ip; it is the return value */
  97. #else
  98. push {r0} /* this build uses ip as scratch in the misaligned-source path, so the original dst is kept on the stack */
  99. #endif
  100. orr r3, r1, r0 /* merge the address bits of src and dst */
  101. ands r3, r3, #3 /* non-zero when either pointer is not 4-byte aligned */
  102. bne .Lmisaligned_copy
  103. .Lbig_block:
  104. subs r2, __OPT_BIG_BLOCK_SIZE /* bias: r2 = remaining - one big block */
  105. blo .Lmid_block /* borrow: less than one big block remains */
  106. /* Kernel loop for big block copy */
  107. .align 2
  108. .Lbig_block_loop:
  109. BEGIN_UNROLL_BIG_BLOCK
  110. #ifdef __ARM_ARCH_7EM__
  111. ldr r3, [r1], #4 /* __ARM_ARCH_7EM__ build: post-indexed word copy, emitted once per unroll entry (the offset value is unused) */
  112. str r3, [r0], #4
  113. END_UNROLL
  114. #else /* __ARM_ARCH_7M__ */
  115. ldr r3, [r1, \offset] /* other builds: fixed-offset word copies, pointers advanced once after the unrolled body */
  116. str r3, [r0, \offset]
  117. END_UNROLL
  118. adds r0, __OPT_BIG_BLOCK_SIZE
  119. adds r1, __OPT_BIG_BLOCK_SIZE
  120. #endif
  121. subs r2, __OPT_BIG_BLOCK_SIZE
  122. bhs .Lbig_block_loop /* no borrow: at least one more full big block remains */
  123. .Lmid_block:
  124. adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE /* swap the big-block bias for a mid-block bias */
  125. blo .Lcopy_word_by_word /* carry clear: less than one mid block remains */
  126. /* Kernel loop for mid-block copy */
  127. .align 2
  128. .Lmid_block_loop:
  129. BEGIN_UNROLL_MID_BLOCK
  130. #ifdef __ARM_ARCH_7EM__
  131. ldr r3, [r1], #4 /* same two variants as the big-block loop, unrolled over the mid-block list */
  132. str r3, [r0], #4
  133. END_UNROLL
  134. #else /* __ARM_ARCH_7M__ */
  135. ldr r3, [r1, \offset]
  136. str r3, [r0, \offset]
  137. END_UNROLL
  138. adds r0, __OPT_MID_BLOCK_SIZE
  139. adds r1, __OPT_MID_BLOCK_SIZE
  140. #endif
  141. subs r2, __OPT_MID_BLOCK_SIZE
  142. bhs .Lmid_block_loop /* no borrow: at least one more full mid block remains */
  143. .Lcopy_word_by_word:
  144. adds r2, __OPT_MID_BLOCK_SIZE - 4 /* swap the mid-block bias for a one-word bias */
  145. blo .Lcopy_less_than_4 /* carry clear: fewer than 4 bytes remain */
  146. /* Kernel loop for small block copy */
  147. .align 2
  148. .Lcopy_word_by_word_loop:
  149. ldr r3, [r1], #4
  150. str r3, [r0], #4
  151. subs r2, #4
  152. bhs .Lcopy_word_by_word_loop
  153. .Lcopy_less_than_4:
  154. adds r2, #4 /* remove the one-word bias: r2 = 0..3 trailing bytes */
  155. beq .Ldone
  156. lsls r2, r2, #31 /* bit 0 of the count decides NE, bit 1 lands in the carry flag */
  157. itt ne
  158. ldrbne r3, [r1], #1 /* odd count: copy a single byte */
  159. strbne r3, [r0], #1
  160. bcc .Ldone /* carry clear: no 2-byte remainder (byte loads and stores leave the flags alone) */
  161. #ifdef __ARM_FEATURE_UNALIGNED
  162. ldrh r3, [r1] /* copy the last two bytes with one halfword access */
  163. strh r3, [r0]
  164. #else
  165. ldrb r3, [r1] /* copy the last two bytes one at a time */
  166. strb r3, [r0]
  167. ldrb r3, [r1, #1]
  168. strb r3, [r0, #1]
  169. #endif /* __ARM_FEATURE_UNALIGNED */
  170. .Ldone:
  171. #ifdef __ARM_FEATURE_UNALIGNED
  172. mov r0, ip /* return the original dst kept in ip at entry */
  173. #else
  174. pop {r0} /* return the original dst pushed at entry */
  175. #endif
  176. bx lr
  177. .align 2
  178. .Lmisaligned_copy: /* Reached from the entry check when dst or src is not 4-byte aligned. */
  179. #ifdef __ARM_FEATURE_UNALIGNED
  180. /* Define label DST_ALIGNED to BIG_BLOCK. It will go to aligned copy
  181. once destination is adjusted to aligned. */
  182. #define Ldst_aligned Lbig_block
  183. /* Copy word by word using LDR when alignment can be done in hardware,
  184. i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. NOTE(review): confirm the control bit and its polarity for the target core. */
  185. cmp r2, #8
  186. blo .Lbyte_copy /* fewer than 8 bytes: plain byte copy */
  187. /* If src is aligned, just go to the big block loop. */
  188. lsls r3, r1, #30 /* Z is set when the two low bits of src are zero */
  189. beq .Ldst_aligned
  190. #else
  191. /* If len < 12, misalignment adjustment has more overhead than
  192. just byte-to-byte copy. Also, len must be >= 8 to guarantee that the
  193. code afterward works correctly. */
  194. cmp r2, #12
  195. blo .Lbyte_copy
  196. #endif /* __ARM_FEATURE_UNALIGNED */
  197. /* Align dst only, not trying to align src. This is because
  198. handling of aligned src and misaligned dst needs more overhead than
  199. otherwise. By doing this, the worst case is when the initial src is aligned:
  200. up to 4 additional bytes of copying will be executed, which is
  201. acceptable. */
  202. ands r3, r0, #3
  203. beq .Ldst_aligned
  204. rsb r3, #4 /* r3 = 4 - (dst & 3): bytes needed to bring dst to a word boundary (1..3) */
  205. subs r2, r3 /* take those bytes off the remaining length */
  206. lsls r3, r3, #31 /* bit 0 decides NE, bit 1 lands in the carry flag; r3 is reused as the data register below */
  207. itt ne
  208. ldrbne r3, [r1], #1 /* odd adjustment: copy one byte */
  209. strbne r3, [r0], #1
  210. bcc .Ldst_aligned /* carry clear: no 2-byte adjustment needed */
  211. #ifdef __ARM_FEATURE_UNALIGNED
  212. ldrh r3, [r1], #2 /* 2-byte adjustment with one halfword access */
  213. strh r3, [r0], #2
  214. b .Ldst_aligned /* in this build the label is redefined above to the aligned big-block path */
  215. #else
  216. ldrb r3, [r1], #1 /* 2-byte adjustment, one byte at a time */
  217. strb r3, [r0], #1
  218. ldrb r3, [r1], #1
  219. strb r3, [r0], #1
  220. /* Now that dst is aligned */
  221. .Ldst_aligned:
  222. /* If r1 is aligned now, it means r0/r1 had the same misalignment,
  223. and they are both aligned now. Go to the aligned copy. */
  224. ands r3, r1, #3 /* r3 = src & 3, the remaining source misalignment */
  225. beq .Lbig_block
  226. /* dst is aligned, but src isn't. Misaligned copy. */
  227. push {r4, r5} /* r4/r5 are used as extra data registers by the loops below */
  228. subs r2, #4 /* bias len by one word for the loop exit test */
  229. /* Move r1 back by the misaligned bytes, to make r1 aligned.
  230. Since we need to restore r1 to the unaligned address after the loop,
  231. we keep the offset bytes in ip and subtract it from r1 afterward. */
  232. subs r1, r3
  233. rsb ip, r3, #4 /* ip = 4 - (src & 3) */
  234. /* Pre-load one word */
  235. ldr r4, [r1], #4
  236. cmp r3, #2 /* dispatch on the misalignment; the value 1 falls through to the 1_3 case */
  237. beq .Lmisaligned_copy_2_2
  238. cmp r3, #3
  239. beq .Lmisaligned_copy_3_1
  240. .macro mis_src_copy shift /* Loop body: shift is 8 times the source misalignment. In: r4 = previously loaded word. Uses r3, r5. */
  241. 1:
  242. lsrs r4, r4, \shift /* drop the low-order bytes of the previous word that are not part of the next output word (shift directions presume little-endian data; NOTE(review): confirm) */
  243. ldr r3, [r1], #4 /* aligned load of the following source word */
  244. lsls r5, r3, 32-\shift /* move its low-order bytes up to fill the vacated high-order positions */
  245. orr r4, r4, r5
  246. str r4, [r0], #4 /* aligned store of the assembled word */
  247. mov r4, r3 /* the word just loaded becomes the previous word */
  248. subs r2, #4
  249. bhs 1b /* no borrow: another full word can still be produced */
  250. .endm
  251. .Lmisaligned_copy_1_3:
  252. mis_src_copy shift=8
  253. b .Lsrc_misaligned_tail
  254. .Lmisaligned_copy_3_1:
  255. mis_src_copy shift=24
  256. b .Lsrc_misaligned_tail
  257. .Lmisaligned_copy_2_2:
  258. /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */
  259. mis_src_copy shift=16
  260. .Lsrc_misaligned_tail:
  261. adds r2, #4 /* remove the one-word bias applied before the loop */
  262. subs r1, ip /* rewind src by ip, the bytes of the last loaded word that were not stored yet */
  263. pop {r4, r5}
  264. #endif /* __ARM_FEATURE_UNALIGNED */
  265. .Lbyte_copy: /* Byte-by-byte copy of the r2 bytes that remain; also the fall-through tail of the misaligned path. */
  266. subs r2, #4 /* r2 = remaining - 4 */
  267. blo .Lcopy_less_than_4 /* fewer than 4 bytes: the shared tail code adds the 4 back and finishes */
  268. .Lbyte_copy_loop:
  269. subs r2, #1 /* flags set here drive the bhs below; the byte load and store do not change them */
  270. ldrb r3, [r1], #1
  271. strb r3, [r0], #1
  272. bhs .Lbyte_copy_loop /* exits after copying (remaining - 3) bytes */
  273. ldrb r3, [r1] /* copy the final three bytes without updating the pointers */
  274. strb r3, [r0]
  275. ldrb r3, [r1, #1]
  276. strb r3, [r0, #1]
  277. ldrb r3, [r1, #2]
  278. strb r3, [r0, #2]
  279. #ifdef __ARM_FEATURE_UNALIGNED
  280. mov r0, ip /* return the original dst kept in ip at entry */
  281. #else
  282. pop {r0} /* return the original dst pushed at entry */
  283. #endif
  284. bx lr
  285. .size memcpy, .-memcpy
  286. #endif
  287. #endif // __MK20DX128__ || __MK20DX256__ || __MK64FX512__ || __MK66FX1M0__