Teensy 4.1 core updated for C++20

memcpy-armv7m.S 7.9KB

6 anos atrás
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
  1. /*
  2. * Copyright (c) 2013 ARM Ltd
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. * 1. Redistributions of source code must retain the above copyright
  9. * notice, this list of conditions and the following disclaimer.
  10. * 2. Redistributions in binary form must reproduce the above copyright
  11. * notice, this list of conditions and the following disclaimer in the
  12. * documentation and/or other materials provided with the distribution.
  13. * 3. The name of the company may not be used to endorse or promote
  14. * products derived from this software without specific prior written
  15. * permission.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
  18. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  19. * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  20. * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  21. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  22. * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  23. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  24. * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  25. * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  26. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  27. */
  28. #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
  29. /*
  30. * Let __ARM_FEATURE_UNALIGNED be set by the architecture and the compiler flags:
  31. * -munaligned-access
  32. * -mno-unaligned-access
  33. * instead of always setting it here.
  34. *
  35. * #define __ARM_FEATURE_UNALIGNED 1
  36. */
  37. /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
  38. unaligned access.
  39. If compiled with GCC, this file should be enclosed within following
  40. pre-processing check:
  41. if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
  42. Prototype: void *memcpy (void *dst, const void *src, size_t count);
  43. The job will be done in 5 steps.
  44. Step 1: Align src/dest pointers, copy mis-aligned if fail to align both
  45. Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes
  46. Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes
  47. Step 4: Copy word by word
  48. Step 5: Copy byte-to-byte
  49. Tunable options:
  50. __OPT_BIG_BLOCK_SIZE: Size of big block in bytes. Default to 64.
  51. __OPT_MID_BLOCK_SIZE: Size of mid block in bytes. Default to 16.
  52. */
  /* Block sizes, in bytes.  Each one is only given its default here when
     the build has not already defined it.  */
  #if !defined (__OPT_BIG_BLOCK_SIZE)
  #define __OPT_BIG_BLOCK_SIZE (4 * 16)
  #endif

  #if !defined (__OPT_MID_BLOCK_SIZE)
  #define __OPT_MID_BLOCK_SIZE (4 * 4)
  #endif

  /* BEGIN_UNROLL_BIG_BLOCK opens an assembler .irp that iterates `offset`
     over the byte offset of every word in one big block (one iteration
     per word).  Only 16, 32 and 64 bytes are supported.  */
  #if __OPT_BIG_BLOCK_SIZE == 64
  #define BEGIN_UNROLL_BIG_BLOCK .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
  #elif __OPT_BIG_BLOCK_SIZE == 32
  #define BEGIN_UNROLL_BIG_BLOCK .irp offset, 0,4,8,12,16,20,24,28
  #elif __OPT_BIG_BLOCK_SIZE == 16
  #define BEGIN_UNROLL_BIG_BLOCK .irp offset, 0,4,8,12
  #else
  #error "Illegal __OPT_BIG_BLOCK_SIZE"
  #endif

  /* BEGIN_UNROLL_MID_BLOCK does the same for one mid block.  Only 8 and
     16 bytes are supported.  */
  #if __OPT_MID_BLOCK_SIZE == 16
  #define BEGIN_UNROLL_MID_BLOCK .irp offset, 0,4,8,12
  #elif __OPT_MID_BLOCK_SIZE == 8
  #define BEGIN_UNROLL_MID_BLOCK .irp offset, 0,4
  #else
  #error "Illegal __OPT_MID_BLOCK_SIZE"
  #endif

  /* Closes the .irp opened by either BEGIN_UNROLL_* macro.  */
  #define END_UNROLL .endr
  /*
   * void *memcpy (void *dst, const void *src, size_t count);
   *
   * Thumb-2 memcpy for ARMv7-M / ARMv7E-M (GNU as, unified syntax).
   *
   * In:    r0 = dst, r1 = src, r2 = count in bytes
   * Out:   r0 = original dst
   * Uses:  r3 (scratch), ip, condition flags.  r4/r5 are pushed and popped
   *        around the shift-and-merge loop, which only exists when
   *        __ARM_FEATURE_UNALIGNED is not defined.
   * Stack: nothing when __ARM_FEATURE_UNALIGNED is defined (dst lives in ip);
   *        otherwise dst is pushed on entry and r4/r5 are pushed inside the
   *        misaligned-source path.
   *
   * Relies on BEGIN_UNROLL_BIG_BLOCK, BEGIN_UNROLL_MID_BLOCK, END_UNROLL,
   * __OPT_BIG_BLOCK_SIZE and __OPT_MID_BLOCK_SIZE, defined earlier in this
   * file.
   *
   * Counting convention: while inside a block/word loop r2 holds
   * (bytes remaining - size of one step), so a single SUBS both counts
   * down and produces the carry that BHS/BLO test.  Each stage adds the
   * difference back before handing over to the next, smaller stage.
   */
          .syntax unified
          .text
          .align  2
          .global memcpy
          .thumb
          .thumb_func
          .type   memcpy, %function
  memcpy:
  #ifdef __ARM_FEATURE_UNALIGNED
          /* ip is not used for anything else in this configuration, so it
             keeps the return value for the whole function.  */
          mov     ip, r0
  #else
          /* ip is needed by the misaligned-source path, so the return value
             is kept on the stack instead.  */
          push    {r0}
  #endif
          orr     r3, r1, r0
          ands    r3, r3, #3              @ low two bits of dst|src
          bne     .Lmisaligned_copy       @ at least one pointer is not word aligned

          /* ---- Both pointers word aligned (or hardware copes) ---- */
  .Lbig_block:
          subs    r2, __OPT_BIG_BLOCK_SIZE
          blo     .Lmid_block             @ fewer bytes than one big block

          /* Kernel loop for big block copy */
          .align  2
  .Lbig_block_loop:
          BEGIN_UNROLL_BIG_BLOCK
  #ifdef __ARM_ARCH_7EM__
          /* Post-indexed form: pointers advance with every word.  */
          ldr     r3, [r1], #4
          str     r3, [r0], #4
          END_UNROLL
  #else /* __ARM_ARCH_7M__ */
          /* Offset form: pointers are advanced once per block below.  */
          ldr     r3, [r1, \offset]
          str     r3, [r0, \offset]
          END_UNROLL
          adds    r0, __OPT_BIG_BLOCK_SIZE
          adds    r1, __OPT_BIG_BLOCK_SIZE
  #endif
          subs    r2, __OPT_BIG_BLOCK_SIZE
          bhs     .Lbig_block_loop

  .Lmid_block:
          /* r2 = remaining - big block size here; rebase it to
             remaining - mid block size.  No carry out means the result is
             still negative, i.e. less than one mid block is left.  */
          adds    r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
          blo     .Lcopy_word_by_word

          /* Kernel loop for mid-block copy */
          .align  2
  .Lmid_block_loop:
          BEGIN_UNROLL_MID_BLOCK
  #ifdef __ARM_ARCH_7EM__
          ldr     r3, [r1], #4
          str     r3, [r0], #4
          END_UNROLL
  #else /* __ARM_ARCH_7M__ */
          ldr     r3, [r1, \offset]
          str     r3, [r0, \offset]
          END_UNROLL
          adds    r0, __OPT_MID_BLOCK_SIZE
          adds    r1, __OPT_MID_BLOCK_SIZE
  #endif
          subs    r2, __OPT_MID_BLOCK_SIZE
          bhs     .Lmid_block_loop

  .Lcopy_word_by_word:
          /* Rebase r2 to remaining - 4.  */
          adds    r2, __OPT_MID_BLOCK_SIZE - 4
          blo     .Lcopy_less_than_4

          /* Kernel loop for small block copy */
          .align  2
  .Lcopy_word_by_word_loop:
          ldr     r3, [r1], #4
          str     r3, [r0], #4
          subs    r2, #4
          bhs     .Lcopy_word_by_word_loop

  .Lcopy_less_than_4:
          /* r2 = remaining - 4 on entry; restore the true remainder (0..3).  */
          adds    r2, #4
          beq     .Ldone

          /* Move bit 0 of the remainder into bit 31 (result non-zero, NE,
             when one odd byte is pending) and bit 1 into the carry flag
             (set when two more bytes are pending).  The byte copies in the
             IT block do not alter the flags, so the carry is still valid
             at the BCC.  */
          lsls    r2, r2, #31
          itt     ne
          ldrbne  r3, [r1], #1
          strbne  r3, [r0], #1

          bcc     .Ldone
  #ifdef __ARM_FEATURE_UNALIGNED
          /* The pointers may be odd here; a halfword access is acceptable
             because unaligned accesses are permitted in this build.  */
          ldrh    r3, [r1]
          strh    r3, [r0]
  #else
          ldrb    r3, [r1]
          strb    r3, [r0]
          ldrb    r3, [r1, #1]
          strb    r3, [r0, #1]
  #endif /* __ARM_FEATURE_UNALIGNED */

  .Ldone:
          /* Return the original dst.  */
  #ifdef __ARM_FEATURE_UNALIGNED
          mov     r0, ip
  #else
          pop     {r0}
  #endif
          bx      lr

          /* ---- At least one of dst/src is not word aligned ---- */
          .align  2
  .Lmisaligned_copy:
  #ifdef __ARM_FEATURE_UNALIGNED
          /* Define label DST_ALIGNED to BIG_BLOCK.  Once the destination
             has been aligned, control goes straight to the aligned copy
             and the hardware deals with any source misalignment.  */
  #define Ldst_aligned Lbig_block

          /* Copy word by word with LDR/STR, relying on the hardware to
             perform misaligned accesses, i.e. unaligned word/halfword
             accesses must not trap (on ARMv7-M: CCR.UNALIGN_TRP clear).  */
          cmp     r2, #8
          blo     .Lbyte_copy

          /* If src is aligned, just go to the big block loop.  */
          lsls    r3, r1, #30
          beq     .Ldst_aligned
  #else
          /* If len < 12, the misalignment adjustment costs more than a
             plain byte-by-byte copy.  Also, len must be >= 8 for the code
             that follows to work correctly.  */
          cmp     r2, #12
          blo     .Lbyte_copy
  #endif /* __ARM_FEATURE_UNALIGNED */

          /* Align dst only; do not try to align src.  Handling an aligned
             src with a misaligned dst needs more overhead than the other
             way round.  The worst case is an initially aligned src, which
             costs up to 4 additional byte copies; that is acceptable.  */
          ands    r3, r0, #3
          beq     .Ldst_aligned

          rsb     r3, #4                  @ r3 = bytes needed to align dst (1..3)
          subs    r2, r3

          /* Same bit-0 / bit-1 split as in the tail code: NE means copy one
             byte, carry set means copy two more.  */
          lsls    r3, r3, #31
          itt     ne
          ldrbne  r3, [r1], #1
          strbne  r3, [r0], #1

          bcc     .Ldst_aligned

  #ifdef __ARM_FEATURE_UNALIGNED
          ldrh    r3, [r1], #2
          strh    r3, [r0], #2
          b       .Ldst_aligned
  #else
          ldrb    r3, [r1], #1
          strb    r3, [r0], #1
          ldrb    r3, [r1], #1
          strb    r3, [r0], #1
          /* Now that dst is aligned */
  .Ldst_aligned:
          /* If r1 is aligned now, r0 and r1 had the same misalignment and
             both are aligned.  Go to the aligned copy.  */
          ands    r3, r1, #3
          beq     .Lbig_block

          /* dst is aligned, but src is not: every destination word has to
             be assembled from two consecutive aligned source words.  */
          push    {r4, r5}
          subs    r2, #4                  @ r2 = remaining - 4 for the loop below

          /* Move r1 back by the misaligned bytes to make it aligned.
             r1 must point at the true (unaligned) source position again
             after the loop, so keep the correction (4 - misalignment) in
             ip and subtract it from r1 afterwards.  */
          subs    r1, r3
          rsb     ip, r3, #4

          /* Pre-load one word */
          ldr     r4, [r1], #4

          cmp     r3, #2
          beq     .Lmisaligned_copy_2_2
          cmp     r3, #3
          beq     .Lmisaligned_copy_3_1

          /* mis_src_copy shift
             Shift-and-merge loop.  `shift` is 8 * (source misalignment).
               r4 = previously loaded aligned source word
               r3 = newly loaded aligned source word
               r5 = scratch
             Each iteration drops the already-consumed bytes of r4, fills
             the gap with the leading bytes of r3 and stores one word.
             Which end of a register holds the lowest-addressed byte
             depends on the data endianness, so the shift directions are
             swapped for big-endian builds; using the little-endian
             directions there would store the bytes in the wrong order.
             Loops while r2 (remaining - 4) stays non-negative.  */
          .macro  mis_src_copy shift
  1:
  #if defined (__ARM_BIG_ENDIAN) || defined (__ARMEB__)
          lsls    r4, r4, \shift
  #else
          lsrs    r4, r4, \shift
  #endif
          ldr     r3, [r1], #4
  #if defined (__ARM_BIG_ENDIAN) || defined (__ARMEB__)
          lsrs    r5, r3, 32-\shift
  #else
          lsls    r5, r3, 32-\shift
  #endif
          orr     r4, r4, r5
          str     r4, [r0], #4
          mov     r4, r3
          subs    r2, #4
          bhs     1b
          .endm

  .Lmisaligned_copy_1_3:
          mis_src_copy shift=8
          b       .Lsrc_misaligned_tail

  .Lmisaligned_copy_3_1:
          mis_src_copy shift=24
          b       .Lsrc_misaligned_tail

  .Lmisaligned_copy_2_2:
          /* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
          mis_src_copy shift=16

  .Lsrc_misaligned_tail:
          adds    r2, #4                  @ back to the true remainder
          subs    r1, ip                  @ back to the true source position
          pop     {r4, r5}
          /* Falls through: the remaining bytes are copied one at a time.  */

  #endif /* __ARM_FEATURE_UNALIGNED */

  .Lbyte_copy:
          subs    r2, #4
          blo     .Lcopy_less_than_4      @ fewer than 4 bytes: shared tail code

          /* r2 = remaining - 4 >= 0.  The loop copies r2 + 1 bytes (the
             flags tested by BHS come from the SUBS; the byte load/store
             leave them untouched) and the three unconditional copies
             after it finish the job.  */
  .Lbyte_copy_loop:
          subs    r2, #1
          ldrb    r3, [r1], #1
          strb    r3, [r0], #1
          bhs     .Lbyte_copy_loop

          ldrb    r3, [r1]
          strb    r3, [r0]
          ldrb    r3, [r1, #1]
          strb    r3, [r0, #1]
          ldrb    r3, [r1, #2]
          strb    r3, [r0, #2]

          /* Return the original dst.  */
  #ifdef __ARM_FEATURE_UNALIGNED
          mov     r0, ip
  #else
          pop     {r0}
  #endif
          bx      lr

          .size   memcpy, .-memcpy
  285. #endif