openwrt/toolchain/uClibc/patches/402-avr32-string-ops.patch

1140 lines
23 KiB
Diff

Subject: [PATCH] AVR32-optimized string operations
Add hand-optimized AVR32-specific string operations. Some of them
need a bit more testing, though.
---
libc/string/avr32/Makefile | 40 +++++++++++
libc/string/avr32/bcopy.S | 15 ++++
libc/string/avr32/bzero.S | 12 +++
libc/string/avr32/memchr.S | 62 +++++++++++++++++
libc/string/avr32/memcmp.S | 50 +++++++++++++
libc/string/avr32/memcpy.S | 110 ++++++++++++++++++++++++++++++
libc/string/avr32/memmove.S | 114 +++++++++++++++++++++++++++++++
libc/string/avr32/memset.S | 60 ++++++++++++++++
libc/string/avr32/strcat.S | 95 ++++++++++++++++++++++++++
libc/string/avr32/strcmp.S | 80 ++++++++++++++++++++++
libc/string/avr32/strcpy.S | 63 +++++++++++++++++
libc/string/avr32/stringtest.c | 144 ++++++++++++++++++++++++++++++++++++++++
libc/string/avr32/strlen.S | 52 ++++++++++++++
libc/string/avr32/strncpy.S | 77 +++++++++++++++++++++
libc/string/avr32/test_memcpy.c | 66 ++++++++++++++++++
15 files changed, 1040 insertions(+)
Index: uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+ .text
+ .global bcopy
+ .type bcopy, @function
+ .align 1
+bcopy:
+ /* bcopy takes (src, dst, len): XOR-swap r12 and r11 to get the (dst, src, len) order memmove.S expects */
+ eor r11, r12
+ eor r12, r11
+ eor r11, r12
+ rjmp __memmove /* plain jump, not a call: __memmove returns to our caller */
+ .size bcopy, . - bcopy
Index: uClibc-0.9.28-avr32/libc/string/avr32/bzero.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/bzero.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,12 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+ .text
+ .global bzero
+ .type bzero, @function
+ .align 1
+bzero: /* bzero(s, n): r12 = s, r11 = n on entry */
+ mov r10, r11 /* n moves to r10, which memset.S uses as its length */
+ mov r11, 0 /* r11 is the fill value in memset.S: zero */
+ rjmp __memset /* plain jump, not a call; NOTE(review): no .size directive here, unlike bcopy.S */
Index: uClibc-0.9.28-avr32/libc/string/avr32/Makefile
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/Makefile 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,40 @@
+# Makefile for uClibc
+#
+# Copyright (C) 2000-2003 Erik Andersen <andersen@uclibc.org>
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Library General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Library General Public License
+# along with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+TOPDIR=../../../
+include $(TOPDIR)Rules.mak
+
+SSRC := bcopy.S bzero.S memcmp.S memcpy.S memmove.S
+SSRC += memset.S strcmp.S strlen.S
+# memchr.S, strcat.S, strcpy.S and strncpy.S are broken and therefore not built
+SOBJS := $(patsubst %.S,%.o, $(SSRC))
+OBJS := $(SOBJS)
+
+OBJ_LIST:= ../../obj.string.$(TARGET_ARCH)
+
+all: $(OBJ_LIST)
+# Write the object names, prefixed with string/$(TARGET_ARCH)/, to the list file
+$(OBJ_LIST): $(OBJS)
+	echo $(addprefix string/$(TARGET_ARCH)/, $(OBJS)) > $@
+# Assemble each .S through $(CC), then run $(STRIPTOOL) -x and drop .note/.comment
+$(SOBJS): %.o: %.S
+	$(CC) $(ASFLAGS) -c $< -o $@
+	$(STRIPTOOL) -x -R .note -R .comment $@
+
+clean:
+	$(RM) *.[oa] *~ core
Index: uClibc-0.9.28-avr32/libc/string/avr32/memchr.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memchr.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define str r12 /* buffer pointer (1st argument) */
+#define chr r11 /* byte to look for (2nd argument) */
+#define len r10 /* number of bytes to search (3rd argument) */
+
+ .text
+ .global memchr
+ .type memchr, @function
+memchr:
+ or chr, chr, chr << 8 /* copy the search byte into bits 15:8 ... */
+ or chr, chr, chr << 16 /* ... and then into the upper half-word; assumes bits 31:8 were clear - TODO confirm */
+
+ mov r9, str
+ andl r9, 3, COH /* r9 = str & 3 */
+ brne .Lunaligned_str
+
+1: sub len, 4 /* word loop: leave it when fewer than four bytes remain */
+ brlt 2f
+ ld.w r8, str++
+ psub.b r9, r8, r11 /* per-byte difference: a matching byte gives 0 */
+ tnbz r9 /* Z is set when some byte of r9 is zero */
+ brne 1b
+
+ sub str, 4 /* the match is in this word: step back to its first byte */
+ bfextu r9, r8, 24, 8 /* bits 31:24 are tested first, for offset 0 */
+ cp.b r9, r11
+ reteq str
+ sub str, -1
+ bfextu r9, r8, 16, 8
+ cp.b r9, r11
+ reteq str
+ sub str, -1
+ bfextu r9, r8, 8, 8
+ cp.b r9, r11
+ reteq str
+ sub str, -1 /* only the last byte is left, so it must be the match */
+ retal str
+
+2: sub len, -4 /* undo the bias: len = bytes still to check (0..3) */
+ reteq 0
+
+3: ld.ub r8, str++
+ cp.w r8, 0 /* NOTE(review): compares with 0, not with chr - looks like a bug, confirm */
+ reteq str /* NOTE(review): str has already been post-incremented here */
+ sub len, 1
+ brne 3b
+
+ retal 0
+
+.Lunaligned_str:
+1: sub len, 1 /* byte loop entered when str is not word-aligned */
+ retlt 0
+ ld.ub r8, str++
+ cp.b r8, r11
+ reteq str /* NOTE(review): returns the address after the matching byte - confirm */
+ sub r9, 1 /* NOTE(review): runs (str & 3) + 1 times, which need not end word-aligned */
+ brge 1b
+
+ rjmp .Laligned_search /* NOTE(review): this label is not defined anywhere in this file */
Index: uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S 2006-10-20 10:42:09.000000000 +0200
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2004 Atmel Norway.
+ */
+
+/*
+ * int memcmp(const void *s1, const void *s2, size_t len)
+ *
+ * Compare word by word while at least four bytes remain, then byte by
+ * byte. The result has the sign of the difference between the first
+ * pair of bytes that differ, both taken as unsigned values.
+ */
+#define s1 r12
+#define s2 r11
+#define len r10
+
+ .text
+ .global memcmp
+ .type memcmp, @function
+ .align 1
+memcmp:
+ sub len, 4
+ brlt .Lless_than_4
+
+ /* len stays biased by -4 here so that its sign ends the loop */
+1: ld.w r8, s1++
+ ld.w r9, s2++
+ cp.w r8, r9
+ brne .Lfound_word
+ sub len, 4
+ brge 1b
+
+.Lless_than_4:
+ /* Remove the bias; zero bytes left means the buffers are equal */
+ sub len, -4
+ reteq 0
+
+1: ld.ub r8, s1++
+ ld.ub r9, s2++
+ sub r8, r9
+ retne r8
+ sub len, 1
+ brgt 1b
+
+ retal 0
+
+.Lfound_word:
+ /*
+ * r8 != r9. Compare the four bytes individually, most significant
+ * first, zero-extending each one before subtracting so that the
+ * result is negative when the s1 byte is the smaller one. (A byte
+ * taken from a psub.b difference with bfextu is always 0..255 and
+ * therefore never negative.)
+ */
+ bfextu r12, r8, 24, 8
+ bfextu r11, r9, 24, 8
+ sub r12, r11
+ retne r12
+ bfextu r12, r8, 16, 8
+ bfextu r11, r9, 16, 8
+ sub r12, r11
+ retne r12
+ bfextu r12, r8, 8, 8
+ bfextu r11, r9, 8, 8
+ sub r12, r11
+ retne r12
+ bfextu r12, r8, 0, 8
+ bfextu r11, r9, 0, 8
+ sub r12, r11
+ retal r12
+
+ .size memcmp, . - memcmp
+
+ .weak bcmp
+ bcmp = memcmp
Index: uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+/* Don't use r12 as dst since we must return it unmodified */
+#define dst r9
+#define src r11
+#define len r10
+
+ .text
+ .global memcpy
+ .type memcpy, @function
+
+ .global __memcpy
+ .hidden __memcpy /* hidden alias; memmove.S branches to it */
+ .type __memcpy, @function
+memcpy:
+__memcpy:
+ pref src[0] /* prefetch hint for the start of src */
+ mov dst, r12 /* work on a copy so r12 still holds the return value */
+
+ /* If we have less than 32 bytes, don't do anything fancy */
+ cp.w len, 32
+ brge .Lmore_than_31
+
+ sub len, 1 /* count down from len - 1 ... */
+ retlt r12 /* ... and return at once when len was 0 */
+1: ld.ub r8, src++
+ st.b dst++, r8
+ sub len, 1
+ brge 1b
+ retal r12
+
+.Lmore_than_31:
+ pushm r0-r7, lr /* r0-r7 carry the data below; every exit is "popm r0-r7, pc" */
+
+ /* Check alignment */
+ mov r8, src
+ andl r8, 31, COH /* r8 = src & 31 */
+ brne .Lunaligned_src
+ mov r8, dst
+ andl r8, 3, COH /* r8 = dst & 3 */
+ brne .Lunaligned_dst
+
+.Laligned_copy:
+ sub len, 32 /* bias len by -32: it stays >= 0 while a full block remains */
+ brlt .Lless_than_32
+
+1: /* Copy 32 bytes at a time */
+ ldm src, r0-r7
+ sub src, -32
+ stm dst, r0-r7
+ sub dst, -32
+ sub len, 32
+ brge 1b
+
+.Lless_than_32:
+ /* Copy 16 more bytes if possible */
+ sub len, -16 /* len + 16 >= 0 means at least 16 bytes remain */
+ brlt .Lless_than_16
+ ldm src, r0-r3
+ sub src, -16
+ sub len, 16
+ stm dst, r0-r3
+ sub dst, -16
+
+.Lless_than_16:
+ /* Do the remaining as byte copies */
+ neg len /* len was (bytes left) - 16, so this is the number of pairs to skip */
+ add pc, pc, len << 2 /* computed jump into the copies below; assumes 4 bytes per ld.ub/st.b pair - TODO confirm */
+ .rept 15
+ ld.ub r0, src++
+ st.b dst++, r0
+ .endr
+
+ popm r0-r7, pc
+
+.Lunaligned_src:
+ /* Make src cacheline-aligned. r8 = (src & 31) */
+ rsub r8, r8, 32 /* r8 = 32 - (src & 31): bytes up to the next 32-byte boundary */
+ sub len, r8 /* len >= 32 and r8 <= 31 here, so len stays positive */
+1: ld.ub r0, src++
+ st.b dst++, r0
+ sub r8, 1
+ brne 1b
+
+ /* If dst is word-aligned, we're ready to go */
+ pref src[0]
+ mov r8, 3
+ tst dst, r8
+ breq .Laligned_copy
+
+.Lunaligned_dst:
+ /* src is aligned, but dst is not. Expect bad performance */
+ sub len, 4 /* bias len by -4 for the word loop */
+ brlt 2f
+1: ld.w r0, src++
+ st.w dst++, r0
+ sub len, 4
+ brge 1b
+
+2: neg len /* len was (bytes left) - 4 */
+ add pc, pc, len << 2 /* computed jump again, this time over three pairs */
+ .rept 3
+ ld.ub r0, src++
+ st.b dst++, r0
+ .endr
+
+ popm r0-r7, pc
+ .size memcpy, . - memcpy
Index: uClibc-0.9.28-avr32/libc/string/avr32/memmove.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memmove.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define dst r12 /* also the return register, see the note at "add dst, len" */
+#define src r11
+#define len r10
+
+ .text
+ .global memmove
+ .type memmove, @function
+
+ .global __memmove
+ .hidden __memmove /* hidden alias; bcopy.S jumps to it */
+ .type __memmove, @function
+memmove:
+__memmove:
+ cp.w src, dst
+ brge __memcpy /* src >= dst: hand over to memcpy.S; NOTE(review): brge is a signed comparison - confirm that is intended for addresses */
+
+ add dst, len /* both pointers now point one past the end; the backward copy */
+ add src, len /* walks dst down by len in total, so r12 is the original dst on return */
+ pref src[-1]
+
+ /*
+ * The rest is basically the same as in memcpy.S except that
+ * the direction is reversed.
+ */
+ cp.w len, 32
+ brge .Lmore_than_31
+
+ sub len, 1
+ retlt r12 /* len was 0, so r12 is still the original dst */
+1: ld.ub r8, --src
+ st.b --dst, r8
+ sub len, 1
+ brge 1b
+ retal r12
+
+.Lmore_than_31:
+ pushm r0-r7, lr /* r0-r7 carry the data below; every exit is "popm r0-r7, pc" */
+
+ /* Check alignment */
+ mov r8, src
+ andl r8, 31, COH /* r8 = (end of src) & 31 */
+ brne .Lunaligned_src
+ mov r8, r12 /* r12 is dst */
+ andl r8, 3, COH
+ brne .Lunaligned_dst
+
+.Laligned_copy:
+ sub len, 32 /* bias len by -32: it stays >= 0 while a full block remains */
+ brlt .Lless_than_32
+
+1: /* Copy 32 bytes at a time */
+ sub src, 32
+ ldm src, r0-r7
+ sub dst, 32
+ sub len, 32
+ stm dst, r0-r7 /* placed between the sub and the brge that tests its flags */
+ brge 1b
+
+.Lless_than_32:
+ /* Copy 16 more bytes if possible */
+ sub len, -16
+ brlt .Lless_than_16
+ sub src, 16
+ ldm src, r0-r3
+ sub dst, 16
+ sub len, 16
+ stm dst, r0-r3
+
+.Lless_than_16:
+ /* Do the remaining as byte copies */
+ sub len, -16 /* len was (bytes left) - 16; now the plain remainder 0..15 */
+ breq 2f
+1: ld.ub r0, --src
+ st.b --dst, r0
+ sub len, 1
+ brne 1b
+
+2: popm r0-r7, pc
+
+.Lunaligned_src:
+ /* Make src cacheline-aligned. r8 = (src & 31) */
+ sub len, r8 /* copying r8 bytes backwards brings src down to a 32-byte boundary */
+1: ld.ub r0, --src
+ st.b --dst, r0
+ sub r8, 1
+ brne 1b
+
+ /* If dst is word-aligned, we're ready to go */
+ pref src[-4]
+ mov r8, 3
+ tst dst, r8
+ breq .Laligned_copy
+
+.Lunaligned_dst:
+ /* src is aligned, but dst is not. Expect bad performance */
+ sub len, 4 /* bias len by -4 for the word loop */
+ brlt 2f
+1: ld.w r0, --src
+ st.w --dst, r0
+ sub len, 4
+ brge 1b
+
+2: neg len /* len was (bytes left) - 4 */
+ add pc, pc, len << 2 /* computed jump over the pairs not needed; assumes 4 bytes per pair - TODO confirm */
+ .rept 3
+ ld.ub r0, --src
+ st.b --dst, r0
+ .endr
+
+ popm r0-r7, pc /* NOTE(review): no .size directive follows, unlike memcpy.S */
Index: uClibc-0.9.28-avr32/libc/string/avr32/memset.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memset.S 2006-10-20 10:42:15.000000000 +0200
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2004 Atmel Norway.
+ */
+
+#define s r12 /* destination (1st argument) */
+#define c r11 /* fill value (2nd argument) */
+#define n r10 /* byte count (3rd argument) */
+
+ .text
+ .global memset
+ .type memset, @function
+
+ .global __memset
+ .hidden __memset /* hidden alias; bzero.S jumps to it */
+ .type __memset, @function
+
+ .align 1
+memset:
+__memset:
+ cp.w n, 32
+ mov r9, s /* keep the original pointer: it is the return value */
+ brge .Llarge_memset
+
+ sub n, 1
+ retlt s /* n was 0: the pointer has not been advanced yet */
+1: st.b s++, c
+ sub n, 1
+ brge 1b
+
+ retal r9
+
+.Llarge_memset:
+ mov r8, r11 /* r8 = fill value */
+ mov r11, 3 /* r11 becomes the word-alignment mask */
+ bfins r8, r8, 8, 8 /* duplicate the low byte into bits 15:8 */
+ bfins r8, r8, 16, 16 /* and the low half-word into bits 31:16 */
+ tst s, r11
+ breq 2f
+
+1: st.b s++, r8 /* single bytes until the pointer is word-aligned */
+ sub n, 1
+ tst s, r11
+ brne 1b
+
+2: mov r11, r9 /* r11 = original pointer, returned at the end */
+ mov r9, r8 /* r8 and r9 both hold the pattern for the st.d below */
+ sub n, 8 /* bias n by -8 for the loop */
+
+3: st.d s++, r8 /* eight bytes per store; presumably the r8/r9 pair - TODO confirm */
+ sub n, 8
+ brge 3b
+
+ /* If we are done, n == -8 and we'll skip all st.b insns below */
+ neg n /* n was (bytes left) - 8, i.e. -8..-1 */
+ lsl n, 1 /* two bytes per skipped st.b - assumes the compact encoding, TODO confirm */
+ add pc, n
+ .rept 7
+ st.b s++, r8
+ .endr
+ retal r11
Index: uClibc-0.9.28-avr32/libc/string/avr32/strcat.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strcat.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define s1 r9 /* working copy of the destination; r12 is kept as the return value */
+#define s2 r11 /* string to append (2nd argument) */
+
+ .text
+ .global strcat
+ .type strcat, @function
+ .align 1
+strcat:
+ mov s1, r12
+
+ /* Make sure s1 is word-aligned */
+ mov r10, s1
+ andl r10, 3, COH /* r10 = s1 & 3 */
+ breq 2f /* NOTE(review): for an aligned s1 this reaches label 2 in the append code, skipping the search for the end of s1 - confirm */
+
+ add pc, pc, r10 << 3 /* computed jump into the byte checks below; depends on their encoded sizes */
+ sub r0, r0, 0 /* 4-byte nop */
+ ld.ub r8, s1++
+ sub r8, r8, 0
+ breq 2f /* NOTE(review): s1 has already been advanced past the NUL at this point */
+ ld.ub r8, s1++
+ sub r8, r8, 0
+ breq 3f
+ ld.ub r8, s1++
+ sub r8, r8, 0
+ breq 4f
+
+ /* Find the end of the first string */
+5: ld.w r8, s1++
+ tnbz r8 /* loop until the word contains a zero byte */
+ brne 5b
+
+ sub s1, 4 /* back to the first byte of that word */
+
+ bfextu r10, r8, 24, 8 /* locate the NUL, testing bits 31:24 first */
+ cp.w r10, 0
+ breq 1f
+ sub s1, -1
+ bfextu r10, r8, 16, 8
+ cp.w r10, 0
+ breq 2f
+ sub s1, -1
+ bfextu r10, r8, 8, 8
+ cp.w r10, 0
+ breq 3f
+ sub s1, -1
+ rjmp 4f
+
+ /* Now, append s2 */
+1: ld.ub r8, s2++ /* entering at label k copies 5 - k bytes, after which s1 is word-aligned */
+ st.b s1++, r8
+ cp.w r8, 0
+ reteq r12
+2: ld.ub r8, s2++
+ st.b s1++, r8
+ cp.w r8, 0
+ reteq r12
+3: ld.ub r8, s2++
+ st.b s1++, r8
+ cp.w r8, 0
+ reteq r12
+4: ld.ub r8, s2++
+ st.b s1++, r8
+ cp.w r8, 0
+ reteq r12
+
+ /* Copy one word at a time */
+ ld.w r8, s2++
+ tnbz r8
+ breq 2f /* NOTE(review): no label 2 is defined after this line */
+1: st.w r8, s2++ /* NOTE(review): operands look reversed; memcpy.S writes stores as "st.w ptr++, reg" */
+ ld.w r8, s2++
+ tnbz r8
+ brne 1b
+
+ /* Copy the remaining bytes */
+ bfextu r10, r8, 24, 8
+ st.b s1++, r10
+ cp.w r10, 0
+ reteq r12
+ bfextu r10, r8, 16, 8
+ st.b s1++, r10
+ cp.w r10, 0
+ reteq r12
+ bfextu r10, r8, 8, 8
+ st.b s1++, r10
+ cp.w r10, 0
+ reteq r12
+ st.b s1++, r8 /* stores the low byte of r8, the only byte left */
+ retal r12
+ .size strcat, . - strcat
Index: uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2004 Atmel Norway.
+ */
+
+#define s1 r12 /* first string (1st argument) */
+#define s2 r11 /* second string (2nd argument) */
+#define len r10 /* not referenced below; r10 is used directly as scratch */
+
+ .text
+ .global strcmp
+ .type strcmp, @function
+ .align 1
+strcmp:
+ mov r8, 3 /* mask for the two low address bits */
+ tst s1, r8
+ brne .Lunaligned_s1
+ tst s2, r8
+ brne .Lunaligned_s2
+
+1: ld.w r8, s1++ /* both pointers word-aligned: compare a word at a time */
+ ld.w r9, s2++
+ cp.w r8, r9
+ brne 2f
+ tnbz r8 /* equal words: stop once one of the bytes is NUL */
+ brne 1b
+ retal 0
+
+2: bfextu r12, r8, 24, 8 /* words differ: walk the bytes from bits 31:24 down */
+ bfextu r11, r9, 24, 8
+ sub r12, r11
+ retne r12 /* the first differing byte decides the result */
+ cp.w r11, 0
+ reteq 0 /* equal and NUL: the strings ended before the difference */
+ bfextu r12, r8, 16, 8
+ bfextu r11, r9, 16, 8
+ sub r12, r11
+ retne r12
+ cp.w r11, 0
+ reteq 0
+ bfextu r12, r8, 8, 8
+ bfextu r11, r9, 8, 8
+ sub r12, r11
+ retne r12
+ cp.w r11, 0
+ reteq 0
+ bfextu r12, r8, 0, 8
+ bfextu r11, r9, 0, 8
+ sub r12, r11
+ retal r12
+
+.Lunaligned_s1:
+3: tst s1, r8 /* r8 is still the mask 3 on this path */
+ breq 4f
+ ld.ub r10, s1++ /* compare single bytes until s1 is word-aligned */
+ ld.ub r9, s2++
+ sub r10, r9
+ retne r10
+ cp.w r9, 0
+ brne 3b
+ retal r10 /* r10 is 0 here: equal bytes and the NUL was reached */
+
+4: tst s2, r8
+ breq 1b /* both aligned now: continue in the word loop above */
+
+.Lunaligned_s2:
+ /*
+ * s1 and s2 can't both be aligned, and unaligned word loads
+ * can trigger spurious exceptions if we cross a page boundary.
+ * Do it the slow way...
+ */
+1: ld.ub r8, s1++
+ ld.ub r9, s2++
+ sub r8, r9
+ retne r8
+ cp.w r9, 0
+ brne 1b
+ retal 0
+
+ .weak strcoll /* strcoll is a weak alias for strcmp */
+ strcoll = strcmp
Index: uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ *
+ * To reduce the size, this one might simply call strncpy with len = -1.
+ */
+
+#define dst r9 /* working copy of the destination; r12 is kept as the return value */
+#define src r11
+
+ .text
+ .global strcpy
+ .type strcpy, @function
+strcpy: /* char *strcpy(char *dst, const char *src): r12 = dst, r11 = src; returns r12 */
+ mov dst, r12
+
+ pref src[0]
+
+ /*
+ * Check alignment. If src is aligned but dst isn't, we can't
+ * do much about it...
+ */
+ mov r8, src
+ andl r8, 3, COH
+ brne .Lunaligned_src
+
+.Laligned_copy:
+1: ld.w r8, src++
+ tnbz r8
+ breq 2f
+ st.w dst++, r8
+ rjmp 1b
+
+2: /*
+ * Ok, r8 now contains the terminating '\0'. Copy the
+ * remaining bytes individually.
+ */
+ bfextu r10, r8, 24, 8
+ st.b dst++, r10
+ cp.w r10, 0
+ reteq r12
+ bfextu r10, r8, 16, 8
+ st.b dst++, r10
+ cp.w r10, 0
+ reteq r12
+ bfextu r10, r8, 8, 8
+ st.b dst++, r10
+ cp.w r10, 0
+ reteq r12
+ st.b dst++, r8
+ retal r12
+
+.Lunaligned_src:
+ /* Copy 4 - (src & 3) bytes singly so that src becomes word-aligned; */
+ /* return early if the terminating NUL is among them. */
+ rsub r8, r8, 4
+3: ld.ub r10, src++
+ st.b dst++, r10
+ cp.w r10, 0
+ reteq r12
+ sub r8, 1
+ brne 3b
+
+ rjmp .Laligned_copy
Index: uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,144 @@
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <sys/mman.h>
+
+#define BUF_SIZE (8 * 1024)
+
+static char *buf1;
+static char *buf1_ref;
+static char *buf2;
+
+extern void *optimized_memcpy(void *dest, void *src, size_t len);
+extern void *optimized_memmove(void *dest, void *src, size_t len);
+extern char *optimized_strcpy(char *dest, char *src);
+extern char *optimized_strncpy(char *dest, char *src, size_t len);
+/* Hex-dump every 16-byte row where buf and ref differ (bytes printed as unsigned char). */
+void dump_mismatch(char *buf, char *ref, size_t len)
+{
+	size_t i, j, end;
+
+	for (i = 0; i < len; i += 16) {
+		end = (len - i < 16) ? len : i + 16;	/* clamp the last row to len */
+		if (memcmp(buf + i, ref + i, end - i) == 0)
+			continue;
+		printf("%4x buf:", (unsigned int)i);
+		for (j = i; j < end; j++)
+			printf(" %02x", (unsigned char)buf[j]);
+		printf("\n ref:");
+		for (j = i; j < end; j++)
+			printf(" %02x", (unsigned char)ref[j]);
+		printf("\n");
+	}
+}
+/* Compare optimized_memcpy() with memcpy() for one offset/length case and print both CPU times. */
+static void test_memcpy(int src_offset, int dst_offset, int len)
+{
+	clock_t start, old, new;
+	int i;
+
+	memset(buf1, 0x55, BUF_SIZE);
+	memset(buf1_ref, 0x55, BUF_SIZE);
+	memset(buf2, 0xaa, BUF_SIZE);
+
+	printf("Testing memcpy with offsets %d => %d and len %d...",
+	       src_offset, dst_offset, len);
+
+	start = clock();
+	for (i = 0; i < 8192; i++)
+		optimized_memcpy(buf1 + dst_offset, buf2 + src_offset, len);
+	new = clock() - start;
+	start = clock();
+	for (i = 0; i < 8192; i++)
+		memcpy(buf1_ref + dst_offset, buf2 + src_offset, len);
+	old = clock() - start;
+
+	if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0)
+		printf("OK\n");
+	else {
+		printf("FAILED\n");
+		dump_mismatch(buf1, buf1_ref, BUF_SIZE);
+	}
+	printf("CPU time used: %ld vs. %ld\n", (long)new, (long)old);	/* clock_t is not int */
+}
+/* Compare one optimized_memmove() call with memmove() for one offset/length case and print both CPU times. */
+static void test_memmove(int src_offset, int dst_offset, int len)
+{
+	clock_t start, old, new;
+
+	memset(buf1, 0x55, BUF_SIZE);
+	memset(buf1_ref, 0x55, BUF_SIZE);
+	memset(buf2, 0xaa, BUF_SIZE);
+
+	printf("Testing memmove with offsets %d => %d and len %d...",
+	       src_offset, dst_offset, len);
+
+	start = clock();
+	optimized_memmove(buf1 + dst_offset, buf2 + src_offset, len);
+	new = clock() - start;
+	start = clock();
+	memmove(buf1_ref + dst_offset, buf2 + src_offset, len);
+	old = clock() - start;
+
+	if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0)
+		printf("OK\n");
+	else {
+		printf("FAILED\n");
+		dump_mismatch(buf1, buf1_ref, BUF_SIZE);
+	}
+	printf("CPU time used: %ld vs. %ld\n", (long)new, (long)old);	/* clock_t is not int */
+}
+/* Map three anonymous BUF_SIZE buffers, then run the memcpy and memmove cases; returns 1 if a mapping fails. */
+int main(int argc, char *argv[])
+{
+	buf2 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);	/* fd -1: portable for anonymous maps */
+	if (buf2 == MAP_FAILED) {
+		perror("Failed to allocate memory for buf2");
+		return 1;
+	}
+	buf1 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf1 == MAP_FAILED) {
+		perror("Failed to allocate memory for buf1");
+		return 1;
+	}
+	buf1_ref = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (buf1_ref == MAP_FAILED) {
+		perror("Failed to allocate memory for buf1_ref");
+		return 1;
+	}
+	printf("\n === MEMCPY ===\n\n");
+
+	test_memcpy(0, 0, BUF_SIZE - 32);
+	test_memcpy(0, 0, 1);
+	test_memcpy(0, 0, 31);
+	test_memcpy(0, 0, 32);
+	test_memcpy(0, 0, 127);
+	test_memcpy(0, 0, 128);
+	test_memcpy(4, 4, BUF_SIZE - 32 - 4);
+	test_memcpy(1, 1, BUF_SIZE - 32 - 1);
+	test_memcpy(1, 1, 126);
+	test_memcpy(0, 3, 128);
+	test_memcpy(1, 4, 128);
+	test_memcpy(0, 0, 0);
+
+	printf("\n === MEMMOVE ===\n\n");	/* NOTE(review): src and dst are separate mappings, so overlapping moves are never exercised */
+
+	test_memmove(0, 0, BUF_SIZE - 32);
+	test_memmove(0, 0, 1);
+	test_memmove(0, 0, 31);
+	test_memmove(0, 0, 32);
+	test_memmove(0, 0, BUF_SIZE - 33);
+	test_memmove(0, 0, 128);
+	test_memmove(4, 4, BUF_SIZE - 32 - 4);
+	test_memmove(1, 1, BUF_SIZE - 32 - 1);
+	test_memmove(1, 1, BUF_SIZE - 130);
+	test_memmove(0, 3, BUF_SIZE - 128);
+	test_memmove(1, 4, BUF_SIZE - 128);
+	test_memmove(0, 0, 0);
+
+	return 0;
+}
Index: uClibc-0.9.28-avr32/libc/string/avr32/strlen.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strlen.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define str r12 /* string pointer (1st argument); the length is returned in the same register */
+
+ .text
+ .global strlen
+ .type strlen, @function
+strlen:
+ mov r11, r12 /* r11 = start of the string, kept for the final subtraction */
+
+ mov r9, str
+ andl r9, 3, COH /* r9 = str & 3 */
+ brne .Lunaligned_str
+
+1: ld.w r8, str++ /* aligned: scan a word at a time */
+ tnbz r8 /* loop until the word contains a zero byte */
+ brne 1b
+
+ sub r12, r11 /* offset just past the word that holds the NUL */
+ bfextu r9, r8, 24, 8
+ cp.w r9, 0
+ subeq r12, 4 /* NUL in bits 31:24, treated as the first byte of the word */
+ reteq r12
+ bfextu r9, r8, 16, 8
+ cp.w r9, 0
+ subeq r12, 3
+ reteq r12
+ bfextu r9, r8, 8, 8
+ cp.w r9, 0
+ subeq r12, 2
+ reteq r12
+ sub r12, 1 /* otherwise the NUL is the last byte of the word */
+ retal r12
+
+.Lunaligned_str:
+ add pc, pc, r9 << 3 /* computed jump that skips (str & 3) - 1 of the three byte checks; assumes 8 bytes per check - TODO confirm */
+ sub r0, r0, 0 /* 4-byte nop */
+ ld.ub r8, str++
+ sub r8, r8, 0 /* presumably used only to set the flags for r8 - TODO confirm */
+ breq 1f
+ ld.ub r8, str++
+ sub r8, r8, 0
+ breq 1f
+ ld.ub r8, str++
+ sub r8, r8, 0
+ brne 1b /* word-aligned now and no NUL yet: continue in the word loop */
+
+1: sub r12, 1 /* the pointer is one past the NUL here */
+ sub r12, r11
+ retal r12
Index: uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define dst r9 /* working copy of the destination; r12 is kept as the return value */
+#define src r11
+
+ .text
+ .global strncpy
+ .type strncpy, @function
+strncpy: /* char *strncpy(char *dst, const char *src, size_t n): r12, r11, r10; returns r12 */
+ mov dst, r12
+
+ pref src[0]
+ mov dst, r12 /* NOTE(review): repeats the mov three lines up */
+
+ /*
+ * Check alignment. If src is aligned but dst isn't, we can't
+ * do much about it...
+ */
+ mov r8, src
+ andl r8, 3, COH
+ brne .Lunaligned_src
+
+.Laligned_copy:
+ sub r10, 4 /* bias the count by -4 for the word loop */
+ brlt 3f
+1: ld.w r8, src++
+ tnbz r8
+ breq 2f
+ st.w dst++, r8
+ sub r10, 4
+ brne 1b /* NOTE(review): loops while r10 != 0, so a count that is not a multiple of 4 steps past zero; brge may be intended */
+
+3: sub r10, -4
+ reteq r12
+
+ /* This is safe as long as src is word-aligned and r10 > 0 */
+ ld.w r8, src++
+
+2: /*
+ * Ok, r8 now contains the terminating '\0'. Copy the
+ * remaining bytes individually.
+ */
+ bfextu r11, r8, 24, 8
+ st.b dst++, r11
+ cp.w r11, 0
+ reteq r12 /* NOTE(review): nothing pads dst with NULs after the terminator, as strncpy() normally does */
+ sub r10, 1 /* NOTE(review): when reached through "breq 2f", r10 is still biased by -4 */
+ reteq r12
+ bfextu r11, r8, 16, 8
+ st.b dst++, r11
+ cp.w r11, 0
+ reteq r12
+ sub r10, 1
+ reteq r12
+ bfextu r11, r8, 8, 8
+ st.b dst++, r11
+ cp.w r11, 0
+ reteq r12
+ sub r10, 1
+ reteq r12
+ st.b dst++, r8
+ retal r12
+
+.Lunaligned_src:
+ /* Copy bytes until we're aligned */
+ min r8, r8, r10 /* NOTE(review): r8 is src & 3, not the 4 - (src & 3) bytes needed to reach alignment */
+ sub r10, r8
+ sub r8, 1
+ retlt r12
+1: ld.ub r10, src++ /* NOTE(review): overwrites the remaining count held in r10, and no NUL check is made */
+ st.b dst++, r10
+ sub r8, 1
+ brge 1b
+
+ rjmp .Laligned_copy
Index: uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c 2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,66 @@
+
+#include <stdio.h>
+#include <string.h>
+
+#define BUF_SIZE 32768
+
+static char buf1[BUF_SIZE] __attribute__((aligned(32)));
+static char buf1_ref[BUF_SIZE] __attribute__((aligned(32)));
+static char buf2[BUF_SIZE] __attribute__((aligned(32)));
+
+extern void *new_memcpy(void *dest, void *src, size_t len);
+/* Hex-dump every 16-byte row where buf and ref differ (bytes printed as unsigned char). */
+void dump_mismatch(char *buf, char *ref, size_t len)
+{
+	size_t i, j, end;
+
+	for (i = 0; i < len; i += 16) {
+		end = (len - i < 16) ? len : i + 16;	/* clamp the last row to len */
+		if (memcmp(buf + i, ref + i, end - i) == 0)
+			continue;
+		printf("%4x buf:", (unsigned int)i);
+		for (j = i; j < end; j++)
+			printf(" %02x", (unsigned char)buf[j]);
+		printf("\n ref:");
+		for (j = i; j < end; j++)
+			printf(" %02x", (unsigned char)ref[j]);
+		printf("\n");
+	}
+}
+/* Copy len bytes with new_memcpy() and with memcpy(), then report whether the two destinations match. */
+void test(int src_offset, int dst_offset, int len)
+{
+	memset(buf1, 0x55, sizeof(buf1));
+	memset(buf1_ref, 0x55, sizeof(buf1_ref));
+	memset(buf2, 0xaa, sizeof(buf2));
+
+	printf("Testing with offsets %d => %d and len %d...",
+	       src_offset, dst_offset, len);
+
+	new_memcpy(buf1 + dst_offset, buf2 + src_offset, len);
+	memcpy(buf1_ref + dst_offset, buf2 + src_offset, len);
+
+	if (memcmp(buf1, buf1_ref, sizeof(buf1)) != 0) {
+		printf("FAILED\n");
+		dump_mismatch(buf1, buf1_ref, sizeof(buf1));
+		return;
+	}
+	printf("OK\n");
+}
+/* Run test() over a fixed list of (source offset, destination offset, length) cases. */
+int main(int argc, char *argv[])
+{
+	static const int cases[][3] = {
+		{ 0, 0, BUF_SIZE }, { 0, 0, 1 }, { 0, 0, 31 }, { 0, 0, 32 },
+		{ 0, 0, 127 }, { 0, 0, 128 }, { 4, 4, BUF_SIZE - 4 },
+		{ 1, 1, BUF_SIZE - 1 }, { 1, 1, 126 }, { 0, 3, 128 },
+		{ 1, 4, 128 },
+	};
+	unsigned int k;
+
+	/* The cases run in the order they are listed */
+	for (k = 0; k < sizeof(cases) / sizeof(cases[0]); k++)
+		test(cases[k][0], cases[k][1], cases[k][2]);
+
+	return 0;
+}