- /*
- * Copyright (c) 2013 ARM Ltd
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- * products derived from this software without specific prior written
- * permission.
- *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
- #if defined(__MK20DX128__) || defined(__MK20DX256__) || defined(__MK64FX512__) || defined(__MK66FX1M0__)
- #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
- #define __ARM_FEATURE_UNALIGNED 1
-
- /* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
- unaligned access.
-
- If compiled with GCC, this file should be enclosed within the
- following pre-processing check:
- #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
-
- Prototype: void *memcpy (void *dst, const void *src, size_t count);
-
- The job is done in 5 steps:
- Step 1: Align the src/dst pointers; fall back to a misaligned copy if both cannot be aligned
- Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes
- Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes
- Step 4: Copy word by word
- Step 5: Copy the remaining bytes one at a time
-
- Tunable options:
- __OPT_BIG_BLOCK_SIZE: Size of a big block in bytes (16, 32 or 64). Defaults to 64.
- __OPT_MID_BLOCK_SIZE: Size of a mid block in bytes (8 or 16). Defaults to 16.
- */
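-
- /* Illustration only: this block is a comment and is never assembled.
-    Assuming word-aligned pointers after step 1 and the default block
-    sizes, steps 2-5 correspond roughly to the following C (a hypothetical
-    memcpy_sketch(); uint32_t/size_t as in <stdint.h>/<stddef.h>):
-
-    void *memcpy_sketch(void *dst, const void *src, size_t count)
-    {
-        uint32_t *d = dst;                    // word-aligned after step 1
-        const uint32_t *s = src;
-        while (count >= 64) {                 // Step 2: big blocks, unrolled in asm
-            for (int i = 0; i < 16; i++) *d++ = *s++;
-            count -= 64;
-        }
-        while (count >= 16) {                 // Step 3: mid blocks, unrolled in asm
-            for (int i = 0; i < 4; i++) *d++ = *s++;
-            count -= 16;
-        }
-        while (count >= 4) {                  // Step 4: single words
-            *d++ = *s++;
-            count -= 4;
-        }
-        unsigned char *db = (unsigned char *)d;         // Step 5: last 0-3 bytes
-        const unsigned char *sb = (const unsigned char *)s;
-        while (count--)
-            *db++ = *sb++;
-        return dst;
-    }
- */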
- #ifndef __OPT_BIG_BLOCK_SIZE
- #define __OPT_BIG_BLOCK_SIZE (4 * 16)
- #endif
-
- #ifndef __OPT_MID_BLOCK_SIZE
- #define __OPT_MID_BLOCK_SIZE (4 * 4)
- #endif
-
- #if __OPT_BIG_BLOCK_SIZE == 16
- #define BEGIN_UNROLL_BIG_BLOCK \
- .irp offset, 0,4,8,12
- #elif __OPT_BIG_BLOCK_SIZE == 32
- #define BEGIN_UNROLL_BIG_BLOCK \
- .irp offset, 0,4,8,12,16,20,24,28
- #elif __OPT_BIG_BLOCK_SIZE == 64
- #define BEGIN_UNROLL_BIG_BLOCK \
- .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
- #else
- #error "Illegal __OPT_BIG_BLOCK_SIZE"
- #endif
-
- #if __OPT_MID_BLOCK_SIZE == 8
- #define BEGIN_UNROLL_MID_BLOCK \
- .irp offset, 0,4
- #elif __OPT_MID_BLOCK_SIZE == 16
- #define BEGIN_UNROLL_MID_BLOCK \
- .irp offset, 0,4,8,12
- #else
- #error "Illegal __OPT_MID_BLOCK_SIZE"
- #endif
-
- #define END_UNROLL .endr
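-
- /* BEGIN_UNROLL_*_BLOCK and END_UNROLL bracket a GNU as .irp block: the
- assembler repeats the enclosed body once per listed value, substituting
- it for \offset. With the default mid block size of 16, the 7M mid-block
- body therefore assembles as ldr/str pairs at offsets 0, 4, 8 and 12. */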
-
- .syntax unified
- .text
- .align 2
- .global memcpy
- .thumb
- .thumb_func
- .type memcpy, %function
- memcpy:
- @ r0: dst
- @ r1: src
- @ r2: len
- #ifdef __ARM_FEATURE_UNALIGNED
- /* When unaligned access is supported, ip is not otherwise used in the
- function body, so stash the return value (dst) there. */
- mov ip, r0
- #else
- push {r0}
- #endif
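- /* OR the two pointers together: if either has one of its low two bits
- set, at least one of them is not word-aligned. */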
- orr r3, r1, r0
- ands r3, r3, #3
- bne .Lmisaligned_copy
-
- .Lbig_block:
- subs r2, __OPT_BIG_BLOCK_SIZE
- blo .Lmid_block
-
- /* Kernel loop for big block copy */
- .align 2
- .Lbig_block_loop:
- BEGIN_UNROLL_BIG_BLOCK
- #ifdef __ARM_ARCH_7EM__
- ldr r3, [r1], #4
- str r3, [r0], #4
- END_UNROLL
- #else /* __ARM_ARCH_7M__ */
- ldr r3, [r1, \offset]
- str r3, [r0, \offset]
- END_UNROLL
- adds r0, __OPT_BIG_BLOCK_SIZE
- adds r1, __OPT_BIG_BLOCK_SIZE
- #endif
- subs r2, __OPT_BIG_BLOCK_SIZE
- bhs .Lbig_block_loop
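-
- /* Note: the __ARM_ARCH_7EM__ (Cortex-M4) path above uses post-indexed
- ldr/str so the pointers advance as a side effect of each access, while
- the __ARM_ARCH_7M__ path uses immediate offsets and bumps r0/r1 once
- per block, presumably the cheaper arrangement on Cortex-M3. */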
-
- .Lmid_block:
- adds r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
- blo .Lcopy_word_by_word
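-
- /* r2 is kept biased by the current block size: after the initial subs it
- holds the remaining length minus one block, the loops continue while no
- borrow occurs (bhs), and the adds above re-biases it for the next,
- smaller block size (likewise at .Lcopy_word_by_word). For example, with
- len = 100: the big loop copies one 64-byte block (r2 ends at -28),
- adds 48 gives 20, the mid loop copies two 16-byte blocks (r2 ends at
- -12), adds 12 gives 0, the word loop copies one word, and the tail is
- empty: 64 + 32 + 4 = 100. */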
-
- /* Kernel loop for mid-block copy */
- .align 2
- .Lmid_block_loop:
- BEGIN_UNROLL_MID_BLOCK
- #ifdef __ARM_ARCH_7EM__
- ldr r3, [r1], #4
- str r3, [r0], #4
- END_UNROLL
- #else /* __ARM_ARCH_7M__ */
- ldr r3, [r1, \offset]
- str r3, [r0, \offset]
- END_UNROLL
- adds r0, __OPT_MID_BLOCK_SIZE
- adds r1, __OPT_MID_BLOCK_SIZE
- #endif
- subs r2, __OPT_MID_BLOCK_SIZE
- bhs .Lmid_block_loop
-
- .Lcopy_word_by_word:
- adds r2, __OPT_MID_BLOCK_SIZE - 4
- blo .Lcopy_less_than_4
-
- /* Kernel loop for small block copy */
- .align 2
- .Lcopy_word_by_word_loop:
- ldr r3, [r1], #4
- str r3, [r0], #4
- subs r2, #4
- bhs .Lcopy_word_by_word_loop
-
- .Lcopy_less_than_4:
- adds r2, #4
- beq .Ldone
-
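- /* At this point r2 holds the remaining length, 1..3. lsls #31 moves
- bit 1 of r2 into the carry flag and bit 0 into the result's sign bit,
- so the conditional ldrb/strb below copy one byte when the length is
- odd, and the bcc skips the halfword copy when bit 1 is clear. */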
- lsls r2, r2, #31
- itt ne
- ldrbne r3, [r1], #1
- strbne r3, [r0], #1
-
- bcc .Ldone
- #ifdef __ARM_FEATURE_UNALIGNED
- ldrh r3, [r1]
- strh r3, [r0]
- #else
- ldrb r3, [r1]
- strb r3, [r0]
- ldrb r3, [r1, #1]
- strb r3, [r0, #1]
- #endif /* __ARM_FEATURE_UNALIGNED */
-
- .Ldone:
- #ifdef __ARM_FEATURE_UNALIGNED
- mov r0, ip
- #else
- pop {r0}
- #endif
- bx lr
-
- .align 2
- .Lmisaligned_copy:
- #ifdef __ARM_FEATURE_UNALIGNED
- /* Alias the label Ldst_aligned to Lbig_block: once the destination has
- been adjusted to an aligned address, control goes straight to the aligned copy. */
- #define Ldst_aligned Lbig_block
-
- /* Copy word by word using LDR/STR when the hardware handles unaligned
- accesses, i.e., alignment trapping (SCTLR.A / CCR.UNALIGN_TRP) is clear. */
-
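- /* Copies shorter than 8 bytes go straight to the byte loop: the
- alignment preamble below is presumably not worth its overhead there,
- and the check also ensures len exceeds the up-to-3-byte dst adjustment. */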
- cmp r2, #8
- blo .Lbyte_copy
-
- /* if src is aligned, just go to the big block loop. */
- lsls r3, r1, #30
- beq .Ldst_aligned
- #else
- /* If len < 12, the misalignment adjustment has more overhead than a
- plain byte-by-byte copy. Also, len must be >= 8 so that the code
- which follows works correctly. */
- cmp r2, #12
- blo .Lbyte_copy
- #endif /* __ARM_FEATURE_UNALIGNED */
-
- /* Align dst only; do not try to align src. Handling an aligned src with
- a misaligned dst costs more than the other way around. With this
- approach the worst case is an initially aligned src, in which case up
- to 4 additional bytes are copied byte by byte, which is acceptable. */
-
- ands r3, r0, #3
- beq .Ldst_aligned
-
- rsb r3, #4
- subs r2, r3
-
- lsls r3, r3, #31
- itt ne
- ldrbne r3, [r1], #1
- strbne r3, [r0], #1
-
- bcc .Ldst_aligned
-
- #ifdef __ARM_FEATURE_UNALIGNED
- ldrh r3, [r1], #2
- strh r3, [r0], #2
- b .Ldst_aligned
- #else
- ldrb r3, [r1], #1
- strb r3, [r0], #1
- ldrb r3, [r1], #1
- strb r3, [r0], #1
- /* dst is now aligned. */
- .Ldst_aligned:
- /* If r1 is now aligned as well, r0 and r1 had the same misalignment
- and both are aligned; go to the aligned copy. */
- ands r3, r1, #3
- beq .Lbig_block
-
- /* dst is aligned, but src isn't. Misaligned copy. */
-
- push {r4, r5}
- subs r2, #4
-
- /* Move r1 back by the misaligned byte count so that it is word-aligned.
- Since r1 must be restored to the unaligned address after the loop,
- keep the correction (4 - misalignment) in ip and subtract it from r1
- afterwards. */
- subs r1, r3
- rsb ip, r3, #4
-
- /* Pre-load one word */
- ldr r4, [r1], #4
-
- cmp r3, #2
- beq .Lmisaligned_copy_2_2
- cmp r3, #3
- beq .Lmisaligned_copy_3_1
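- /* A misalignment of 1 byte falls through: the .macro definition below
- emits no instructions by itself, so execution continues at
- .Lmisaligned_copy_1_3. */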
-
- .macro mis_src_copy shift
- 1:
- lsrs r4, r4, \shift
- ldr r3, [r1], #4
- lsls r5, r3, 32-\shift
- orr r4, r4, r5
- str r4, [r0], #4
- mov r4, r3
- subs r2, #4
- bhs 1b
- .endm
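-
- /* Illustration only (comment, never assembled): with src misaligned by
- m bytes (shift = 8*m) and a little-endian core, each iteration of
- mis_src_copy stitches one output word from the previously loaded word
- and the next one, roughly:
-
-    uint32_t next = *src_aligned++;               // ldr r3, [r1], #4
-    *dst++ = (prev >> shift) | (next << (32 - shift));
-    prev = next;                                  // mov r4, r3
- */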
-
- .Lmisaligned_copy_1_3:
- mis_src_copy shift=8
- b .Lsrc_misaligned_tail
-
- .Lmisaligned_copy_3_1:
- mis_src_copy shift=24
- b .Lsrc_misaligned_tail
-
- .Lmisaligned_copy_2_2:
- /* For 2_2 misalignment, ldr is still faster than 2 x ldrh. */
- mis_src_copy shift=16
-
- .Lsrc_misaligned_tail:
- adds r2, #4
- subs r1, ip
- pop {r4, r5}
-
- #endif /* __ARM_FEATURE_UNALIGNED */
-
- .Lbyte_copy:
- subs r2, #4
- blo .Lcopy_less_than_4
-
- .Lbyte_copy_loop:
- subs r2, #1
- ldrb r3, [r1], #1
- strb r3, [r0], #1
- bhs .Lbyte_copy_loop
-
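- /* The loop above stopped after copying len-3 bytes; copy the final
- three bytes here. */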
- ldrb r3, [r1]
- strb r3, [r0]
- ldrb r3, [r1, #1]
- strb r3, [r0, #1]
- ldrb r3, [r1, #2]
- strb r3, [r0, #2]
-
- #ifdef __ARM_FEATURE_UNALIGNED
- mov r0, ip
- #else
- pop {r0}
- #endif
- bx lr
-
- .size memcpy, .-memcpy
- #endif // __ARM_ARCH_7M__ || __ARM_ARCH_7EM__
-
- #endif // __MK20DX128__ || __MK20DX256__ || __MK64FX512__ || __MK66FX1M0__