Initial commit; kernel source import

2025-04-06 23:50:55 -05:00
commit 25c6d769f4
45093 changed files with 18199410 additions and 0 deletions
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -0,0 +1,47 @@
+#
+# linux/arch/arm/lib/Makefile
+#
+# Copyright (C) 1995-2000 Russell King
+#
+# Objects added to lib-y unconditionally; CPU- and machine-specific ones are appended below.
+lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
+		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
+		   delay.o delay-loop.o findbit.o memchr.o memcpy.o   \
+		   memmove.o memset.o memzero.o setbit.o              \
+		   strchr.o strrchr.o                                 \
+		   testchangebit.o testclearbit.o testsetbit.o        \
+		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
+		   ucmpdi2.o lib1funcs.o div64.o                      \
+		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
+		   call_with_stack.o
+
+mmu-y	:= clear_user.o copy_page.o getuser.o putuser.o
+
+# uaccess.S is not preemption-safe and is probably only faster on ARMv3, so it is
+# used solely by non-preemptible ARMv3 kernels; all others get copy_{from,to}_user.o
+ifeq ($(CONFIG_PREEMPT),y)
+  mmu-y	+= copy_from_user.o copy_to_user.o
+else
+ifneq ($(CONFIG_CPU_32v3),y)
+  mmu-y	+= copy_from_user.o copy_to_user.o
+else
+  mmu-y	+= uaccess.o
+endif
+endif
+
+# This must be obj-, not lib-: as a lib- object it would not override the weak symbols already available
+obj-$(CONFIG_UACCESS_WITH_MEMCPY) += uaccess_with_memcpy.o
+# The mmu-y objects collected above join the library only when CONFIG_MMU is set.
+lib-$(CONFIG_MMU) += $(mmu-y)
+# io-readsw/io-writesw: ARMv3 CPUs get their own variant, every other CPU the ARMv4 one.
+ifeq ($(CONFIG_CPU_32v3),y)
+  lib-y	+= io-readsw-armv3.o io-writesw-armv3.o
+else
+  lib-y	+= io-readsw-armv4.o io-writesw-armv4.o
+endif
+# Objects needed only by particular machine types.
+lib-$(CONFIG_ARCH_RPC)		+= ecard.o io-acorn.o floppydma.o
+lib-$(CONFIG_ARCH_SHARK)	+= io-shark.o
+# Extra prerequisite: csumpartialcopy.S includes csumpartialcopygeneric.S (csumpartialcopyuser.S presumably does too).
+$(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
+$(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S
--- a/arch/arm/lib/ashldi3.S
+++ b/arch/arm/lib/ashldi3.S
@@ -0,0 +1,53 @@
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define al r1	/* low word of the 64-bit value */
+#define ah r0	/* high word of the 64-bit value */
+#else
+#define al r0	/* low word of the 64-bit value */
+#define ah r1	/* high word of the 64-bit value */
+#endif
+
+ENTRY(__ashldi3)
+ENTRY(__aeabi_llsl)
+	@ 64-bit shift left: value in r0/r1, shift count in r2, result returned in r0/r1 (r3 and ip are scratch)
+	subs	r3, r2, #32		@ r3 = count - 32, mi: count below 32, pl: count 32 or more
+	rsb	ip, r2, #32		@ ip = 32 - count
+	movmi	ah, ah, lsl r2		@ count below 32: shift the high word
+	movpl	ah, al, lsl r3		@ count 32 or more: high word = low word shifted by count - 32
+ ARM(	orrmi	ah, ah, al, lsr ip	)	@ count below 32: merge in the bits shifted out of the low word
+ THUMB(	lsrmi	r3, al, ip		)	@ Thumb: same merge in two steps through r3
+ THUMB(	orrmi	ah, ah, r3		)
+	mov	al, al, lsl r2		@ low word shifted by count (a register shift of 32 or more gives 0)
+	mov	pc, lr
+
+ENDPROC(__ashldi3)
+ENDPROC(__aeabi_llsl)
--- a/arch/arm/lib/ashrdi3.S
+++ b/arch/arm/lib/ashrdi3.S
@@ -0,0 +1,53 @@
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define al r1	/* low word of the 64-bit value */
+#define ah r0	/* high word of the 64-bit value */
+#else
+#define al r0	/* low word of the 64-bit value */
+#define ah r1	/* high word of the 64-bit value */
+#endif
+
+ENTRY(__ashrdi3)
+ENTRY(__aeabi_lasr)
+	@ 64-bit arithmetic shift right: value in r0/r1, shift count in r2, result returned in r0/r1 (r3 and ip are scratch)
+	subs	r3, r2, #32		@ r3 = count - 32, mi: count below 32, pl: count 32 or more
+	rsb	ip, r2, #32		@ ip = 32 - count
+	movmi	al, al, lsr r2		@ count below 32: logical shift of the low word
+	movpl	al, ah, asr r3		@ count 32 or more: low word = high word shifted by count - 32, sign filled
+ ARM(	orrmi	al, al, ah, lsl ip	)	@ count below 32: merge in the bits shifted out of the high word
+ THUMB(	lslmi	r3, ah, ip		)	@ Thumb: same merge in two steps through r3
+ THUMB(	orrmi	al, al, r3		)
+	mov	ah, ah, asr r2		@ high word shifted by count (asr of 32 or more leaves only sign bits)
+	mov	pc, lr
+
+ENDPROC(__ashrdi3)
+ENDPROC(__aeabi_lasr)
--- a/arch/arm/lib/backtrace.S
+++ b/arch/arm/lib/backtrace.S
@@ -0,0 +1,152 @@
+/*
+ *  linux/arch/arm/lib/backtrace.S
+ *
+ *  Copyright (C) 1995, 1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 27/03/03 Ian Molton Clean up CONFIG_CPU
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+		.text
+
+@ fp is 0 or stack frame
+
+#define frame	r4	/* frame pointer currently being examined */
+#define sv_fp	r5	/* saved fp read from that frame, i.e. the next frame to visit */
+#define sv_pc	r6	/* saved pc read from that frame */
+#define mask	r7	/* bits stripped from pc/lr and required clear in fp: 0xfc000003 or 0 */
+#define offset	r8	/* how far a pc stored by stmfd lies beyond the stmfd itself */
+
+ENTRY(c_backtrace)
+
+#if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK)
+		mov	pc, lr			@ no frame pointers or no printk: nothing to do
+ENDPROC(c_backtrace)
+#else
+		stmfd	sp!, {r4 - r8, lr}	@ Save an extra register so we have a location...
+		movs	frame, r0		@ if frame pointer is zero
+		beq	no_frame		@ we have no stack frames
+
+		tst	r1, #0x10		@ 26 or 32-bit mode?
+ ARM(		moveq	mask, #0xfc000003	)
+ THUMB(		moveq	mask, #0xfc000000	)
+ THUMB(		orreq	mask, #0x03		)
+		movne	mask, #0		@ mask for 32-bit
+
+1:		stmfd	sp!, {pc}		@ calculate offset of PC stored
+		ldr	r0, [sp], #4		@ by stmfd for this CPU
+		adr	r1, 1b			@ r1 = address of that stmfd
+		sub	offset, r0, r1		@ stored pc minus address of the storing instruction
+
+/*
+ * Stack frame layout:
+ *             optionally saved caller registers (r4 - r10)
+ *             saved fp
+ *             saved sp
+ *             saved lr
+ *    frame => saved pc
+ *             optionally saved arguments (r0 - r3)
+ * saved sp => <next word>
+ *
+ * Functions start with the following code sequence:
+ *                  mov   ip, sp
+ *                  stmfd sp!, {r0 - r3} (optional)
+ * corrected pc =>  stmfd sp!, {..., fp, ip, lr, pc}
+ */
+for_each_frame:	tst	frame, mask		@ Check for address exceptions
+		bne	no_frame
+
+1001:		ldr	sv_pc, [frame, #0]	@ get saved pc
+1002:		ldr	sv_fp, [frame, #-12]	@ get saved fp
+
+		sub	sv_pc, sv_pc, offset	@ Correct PC for prefetching
+		bic	sv_pc, sv_pc, mask	@ mask PC/LR for the mode
+
+1003:		ldr	r2, [sv_pc, #-4]	@ if stmfd sp!, {args} exists,
+		ldr	r3, .Ldsi+4		@ adjust saved 'pc' back one
+		teq	r3, r2, lsr #10		@ instruction
+		subne	r0, sv_pc, #4		@ allow for mov
+		subeq	r0, sv_pc, #8		@ allow for mov + stmia
+
+		ldr	r1, [frame, #-4]	@ get saved lr
+		mov	r2, frame		@ third argument: the frame being reported
+		bic	r1, r1, mask		@ mask PC/LR for the mode
+		bl	dump_backtrace_entry	@ called with r0 = adjusted pc, r1 = saved lr, r2 = frame
+
+		ldr	r1, [sv_pc, #-4]	@ if stmfd sp!, {args} exists,
+		ldr	r3, .Ldsi+4		@ r3 = pattern for stmfd sp!, {args}
+		teq	r3, r1, lsr #10		@ compare all but the low 10 bits of the instruction
+		ldreq	r0, [frame, #-8]	@ get sp
+		subeq	r0, r0, #4		@ point at the last arg
+		bleq	.Ldumpstm		@ dump saved registers
+
+1004:		ldr	r1, [sv_pc, #0]		@ if stmfd sp!, {..., fp, ip, lr, pc}
+		ldr	r3, .Ldsi		@ instruction exists,
+		teq	r3, r1, lsr #10
+		subeq	r0, frame, #16		@ r0 = the word just below the saved fp
+		bleq	.Ldumpstm		@ dump saved registers
+
+		teq	sv_fp, #0		@ zero saved fp means
+		beq	no_frame		@ no further frames
+
+		cmp	sv_fp, frame		@ next frame must be
+		mov	frame, sv_fp		@ above the current frame
+		bhi	for_each_frame
+
+1006:		adr	r0, .Lbad		@ bad frame pointer, also the fixup target for the loads at 1001-1004
+		mov	r1, frame		@ printk argument: the rejected frame pointer
+		bl	printk
+no_frame:	ldmfd	sp!, {r4 - r8, pc}	@ restore registers and return
+ENDPROC(c_backtrace)
+		
+		.pushsection __ex_table,"a"
+		.align	3
+		.long	1001b, 1006b		@ pairs of (load that may fault, address to continue at)
+		.long	1002b, 1006b
+		.long	1003b, 1006b
+		.long	1004b, 1006b
+		.popsection
+
+#define instr r4	/* the stm instruction word whose register list is decoded */
+#define reg   r5	/* register number being tested, counting from 10 down to 0 */
+#define stack r6	/* address of the next saved value to print */
+
+.Ldumpstm:	stmfd	sp!, {instr, reg, stack, r7, lr}
+		mov	stack, r0		@ r0 on entry: address of the highest saved word
+		mov	instr, r1		@ r1 on entry: the instruction word
+		mov	reg, #10
+		mov	r7, #0			@ r7 counts the values printed on the current line
+1:		mov	r3, #1
+ ARM(		tst	instr, r3, lsl reg	)
+ THUMB(		lsl	r3, reg			)
+ THUMB(		tst	instr, r3		)
+		beq	2f			@ this register is not in the list
+		add	r7, r7, #1
+		teq	r7, #6			@ the sixth value starts a new line
+		moveq	r7, #1
+		moveq	r1, #'\n'
+		movne	r1, #' '
+		ldr	r3, [stack], #-4	@ r3 = saved value, then step down one word
+		mov	r2, reg
+		adr	r0, .Lfp
+		bl	printk			@ arguments: format, separator char, register number, value
+2:		subs	reg, reg, #1
+		bpl	1b
+		teq	r7, #0
+		adrne	r0, .Lcr		@ end the line if anything was printed
+		blne	printk
+		ldmfd	sp!, {instr, reg, stack, r7, pc}
+
+.Lfp:		.asciz	"%cr%d:%08x"
+.Lcr:		.asciz	"\n"
+.Lbad:		.asciz	"Backtrace aborted due to bad frame pointer <%p>\n"
+		.align
+.Ldsi:		.word	0xe92dd800 >> 10	@ stmfd sp!, {... fp, ip, lr, pc}
+		.word	0xe92d0000 >> 10	@ stmfd sp!, {}
+
+#endif
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -0,0 +1,95 @@
+#include <asm/unwind.h>
+
+#if __LINUX_ARM_ARCH__ >= 6
+	.macro	bitop, name, instr
+ENTRY(	\name		)
+UNWIND(	.fnstart	)
+	ands	ip, r1, #3		@ entry: r0 = bit number, r1 = base address of the bitmap
+	strneb	r1, [ip]		@ assert word-aligned
+	mov	r2, #1
+	and	r3, r0, #31		@ Get bit offset
+	mov	r0, r0, lsr #5		@ r0 = index of the 32-bit word holding the bit
+	add	r1, r1, r0, lsl #2	@ Get word offset
+	mov	r3, r2, lsl r3		@ r3 = mask with only the selected bit set
+1:	ldrex	r2, [r1]		@ exclusive load of the word
+	\instr	r2, r2, r3		@ apply the operation using the mask
+	strex	r0, r2, [r1]		@ r0 = 0 when the exclusive store succeeded
+	cmp	r0, #0
+	bne	1b			@ store failed: reload and retry
+	bx	lr
+UNWIND(	.fnend		)
+ENDPROC(\name		)
+	.endm
+
+	.macro	testop, name, instr, store
+ENTRY(	\name		)
+UNWIND(	.fnstart	)
+	ands	ip, r1, #3		@ entry: r0 = bit number, r1 = base address of the bitmap
+	strneb	r1, [ip]		@ assert word-aligned
+	mov	r2, #1
+	and	r3, r0, #31		@ Get bit offset
+	mov	r0, r0, lsr #5		@ r0 = index of the 32-bit word holding the bit
+	add	r1, r1, r0, lsl #2	@ Get word offset
+	mov	r3, r2, lsl r3		@ create mask
+	smp_dmb
+1:	ldrex	r2, [r1]		@ exclusive load of the word
+	ands	r0, r2, r3		@ save old value of bit
+	\instr	r2, r2, r3		@ apply the operation to the bit
+	strex	ip, r2, [r1]		@ ip = 0 on success (the store macro argument is unused in this variant)
+	cmp	ip, #0
+	bne	1b			@ store failed: reload and retry
+	smp_dmb
+	cmp	r0, #0			@ turn the masked old value into 0 or 1
+	movne	r0, #1
+2:	bx	lr			@ return r0 = old bit value (label 2 is not referenced here)
+UNWIND(	.fnend		)
+ENDPROC(\name		)
+	.endm
+#else
+	.macro	bitop, name, instr
+ENTRY(	\name		)
+UNWIND(	.fnstart	)
+	ands	ip, r1, #3		@ entry: r0 = bit number, r1 = base address of the bitmap
+	strneb	r1, [ip]		@ assert word-aligned
+	and	r2, r0, #31		@ r2 = bit offset within the word
+	mov	r0, r0, lsr #5		@ r0 = index of the 32-bit word holding the bit
+	mov	r3, #1
+	mov	r3, r3, lsl r2		@ r3 = mask with only the selected bit set
+	save_and_disable_irqs ip
+	ldr	r2, [r1, r0, lsl #2]	@ read-modify-write between save_and_disable_irqs and restore_irqs
+	\instr	r2, r2, r3
+	str	r2, [r1, r0, lsl #2]
+	restore_irqs ip
+	mov	pc, lr
+UNWIND(	.fnend		)
+ENDPROC(\name		)
+	.endm
+
+/**
+ * testop - implement a test_and_xxx_bit operation.
+ * @instr: operational instruction
+ * @store: store instruction
+ * Entry: r0 = bit number, r1 = bitmap base.  Exit: r0 = 1 if the bit was set, else 0.
+ * Note: we can trivially conditionalise the store instruction
+ * to avoid dirtying the data cache.
+ */
+	.macro	testop, name, instr, store
+ENTRY(	\name		)
+UNWIND(	.fnstart	)
+	ands	ip, r1, #3
+	strneb	r1, [ip]		@ assert word-aligned
+	and	r3, r0, #31		@ r3 = bit offset within the word
+	mov	r0, r0, lsr #5		@ r0 = index of the 32-bit word holding the bit
+	save_and_disable_irqs ip
+	ldr	r2, [r1, r0, lsl #2]!	@ load the word, writeback leaves its address in r1
+	mov	r0, #1
+	tst	r2, r0, lsl r3		@ Z set when the bit was clear
+	\instr	r2, r2, r0, lsl r3	@ must leave the flags from tst alone (moveq below needs them)
+	\store	r2, [r1]
+	moveq	r0, #0			@ r0 = 0 if the bit was clear, otherwise it stays 1
+	restore_irqs ip
+	mov	pc, lr
+UNWIND(	.fnend		)
+ENDPROC(\name		)
+	.endm
+#endif
--- a/arch/arm/lib/call_with_stack.S
+++ b/arch/arm/lib/call_with_stack.S
@@ -0,0 +1,44 @@
+/*
+ * arch/arm/lib/call_with_stack.S
+ *
+ * Copyright (C) 2011 ARM Ltd.
+ * Written by Will Deacon <will.deacon@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * void call_with_stack(void (*fn)(void *), void *arg, void *sp)
+ *
+ * Change the stack to that pointed at by sp, then invoke fn(arg) with
+ * the new stack.
+ */
+ENTRY(call_with_stack)
+	str	sp, [r2, #-4]!		@ push the current sp onto the new stack
+	str	lr, [r2, #-4]!		@ push our return address below it
+
+	mov	sp, r2			@ switch to the new stack
+	mov	r2, r0			@ r2 = fn
+	mov	r0, r1			@ r0 = arg, the single argument of fn
+
+	adr	lr, BSYM(1f)		@ fn returns to label 1 below
+	mov	pc, r2			@ call fn(arg)
+
+1:	ldr	lr, [sp]		@ reload our return address
+	ldr	sp, [sp, #4]		@ switch back to the original stack
+	mov	pc, lr
+ENDPROC(call_with_stack)
--- a/arch/arm/lib/changebit.S
+++ b/arch/arm/lib/changebit.S
@@ -0,0 +1,15 @@
+/*
+ *  linux/arch/arm/lib/changebit.S
+ *
+ *  Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include "bitops.h"
+                .text
+@ _change_bit: the bitop macro from bitops.h instantiated with eor, so the selected bit is inverted
+bitop	_change_bit, eor
--- a/arch/arm/lib/clear_user.S
+++ b/arch/arm/lib/clear_user.S
@@ -0,0 +1,54 @@
+/*
+ *  linux/arch/arm/lib/clear_user.S
+ *
+ *  Copyright (C) 1995, 1996,1997,1998 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+/* Prototype: int __clear_user(void *addr, size_t sz)
+ * Purpose  : clear some user memory
+ * Params   : addr - user memory address to clear
+ *          : sz   - number of bytes to clear
+ * Returns  : number of bytes NOT cleared
+ */
+ENTRY(__clear_user_std)
+WEAK(__clear_user)
+		stmfd	sp!, {r1, lr}		@ keep the requested size for the fault path at 9001
+		mov	r2, #0			@ r2 = the zero value that gets stored
+		cmp	r1, #4
+		blt	2f			@ fewer than 4 bytes: only the byte tail runs
+		ands	ip, r0, #3		@ ip = misalignment of the address
+		beq	1f
+		cmp	ip, #2			@ next: 4 - ip byte stores (strusr is from asm/assembler.h, presumably a post-incrementing user store)
+		strusr	r2, r0, 1
+		strusr	r2, r0, 1, le
+		strusr	r2, r0, 1, lt
+		rsb	ip, ip, #4		@ ip = number of bytes just stored
+		sub	r1, r1, ip		@  7  6  5  4  3  2  1
+1:		subs	r1, r1, #8		@ -1 -2 -3 -4 -5 -6 -7
+		strusr	r2, r0, 4, pl, rept=2
+		bpl	1b
+		adds	r1, r1, #4		@  3  2  1  0 -1 -2 -3
+		strusr	r2, r0, 4, pl
+2:		tst	r1, #2			@ 1x 1x 0x 0x 1x 1x 0x
+		strusr	r2, r0, 1, ne, rept=2
+		tst	r1, #1			@ x1 x0 x1 x0 x1 x0 x1
+		it	ne			@ explicit IT needed for the label
+USER(		strnebt	r2, [r0])
+		mov	r0, #0			@ success: zero bytes left uncleared
+		ldmfd	sp!, {r1, pc}
+ENDPROC(__clear_user)
+ENDPROC(__clear_user_std)
+
+		.pushsection .fixup,"ax"
+		.align	0
+9001:		ldmfd	sp!, {r0, pc}		@ fault path: pops the saved size into r0, so the whole request is reported as not cleared
+		.popsection
+
--- a/arch/arm/lib/clearbit.S
+++ b/arch/arm/lib/clearbit.S
@@ -0,0 +1,15 @@
+/*
+ *  linux/arch/arm/lib/clearbit.S
+ *
+ *  Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include "bitops.h"
+                .text
+@ _clear_bit: the bitop macro from bitops.h instantiated with bic, so the selected bit is cleared
+bitop	_clear_bit, bic
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -0,0 +1,104 @@
+/*
+ *  linux/arch/arm/lib/copy_from_user.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 29, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Prototype:
+ *
+ *	size_t __copy_from_user(void *to, const void *from, size_t n)
+ *
+ * Purpose:
+ *
+ *	copy a block to kernel memory from user memory
+ *
+ * Params:
+ *
+ *	to = kernel memory
+ *	from = user memory
+ *	n = number of bytes to copy
+ *
+ * Return value:
+ *
+ *	Number of bytes NOT copied.
+ */
+
+#ifndef CONFIG_THUMB2_KERNEL
+#define LDR1W_SHIFT	0	/* copy_template.S scales its jump into the ldr1w run by 1 << LDR1W_SHIFT */
+#else
+#define LDR1W_SHIFT	1	/* Thumb-2 kernel: the jump offset into the ldr1w run is doubled */
+#endif
+#define STR1W_SHIFT	0	/* str1w below is a single W(str) in every configuration */
+
+	.macro ldr1w ptr reg abort
+	ldrusr	\reg, \ptr, 4, abort=\abort
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldr1w \ptr, \reg1, \abort
+	ldr1w \ptr, \reg2, \abort
+	ldr1w \ptr, \reg3, \abort
+	ldr1w \ptr, \reg4, \abort
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
+	ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
+	.endm
+
+	.macro str1w ptr reg abort
+	W(str) \reg, [\ptr], #4		@ plain store, the abort argument is unused (destination is kernel memory)
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}	@ eight words in one store-multiple
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1	@ plain conditional byte store, the abort argument is unused
+	.endm
+
+	.macro enter reg1 reg2
+	mov	r3, #0			@ this zero fills the slot that exit pops as the return value
+	stmdb	sp!, {r0, r2, r3, \reg1, \reg2}	@ r0 and r2 (destination and count) are kept for the abort handler
+	.endm
+
+	.macro exit reg1 reg2
+	add	sp, sp, #8		@ drop the saved destination and count
+	ldmfd	sp!, {r0, \reg1, \reg2}	@ r0 = the zero stored by enter: no bytes left uncopied
+	.endm
+
+	.text
+
+ENTRY(__copy_from_user)
+
+#include "copy_template.S"
+
+ENDPROC(__copy_from_user)
+
+	.pushsection .fixup,"ax"
+	.align 0
+	copy_abort_preamble
+	ldmfd	sp!, {r1, r2}		@ r1 = original destination, r2 = original count (both saved by enter)
+	sub	r3, r0, r1		@ r3 = bytes already written to the destination
+	rsb	r1, r3, r2		@ r1 = bytes not copied
+	str	r1, [sp]		@ keep that number in the slot that held the zero
+	bl	__memzero		@ called with r0 = current destination, r1 = bytes not copied
+	ldr	r0, [sp], #4		@ return value: bytes not copied
+	copy_abort_end
+	.popsection
+
--- a/arch/arm/lib/copy_page.S
+++ b/arch/arm/lib/copy_page.S
@@ -0,0 +1,47 @@
+/*
+ *  linux/arch/arm/lib/copy_page.S
+ *
+ *  Copyright (C) 1995-1999 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/asm-offsets.h>
+#include <asm/cache.h>
+
+#define COPY_COUNT (PAGE_SZ / (2 * L1_CACHE_BYTES) PLD( -1 ))	/* loop passes of 2 * L1_CACHE_BYTES each; with PLD the last one is run separately */
+
+		.text
+		.align	5
+/*
+ * StrongARM optimised copy_page routine
+ * now 1.78bytes/cycle, was 1.60 bytes/cycle (50MHz bus -> 89MB/s)
+ * Note that we probably achieve closer to the 100MB/s target with
+ * the core clock switching.
+ */
+ENTRY(copy_page)
+		stmfd	sp!, {r4, lr}			@	2
+	PLD(	pld	[r1, #0]		)
+	PLD(	pld	[r1, #L1_CACHE_BYTES]		)
+		mov	r2, #COPY_COUNT			@	1
+		ldmia	r1!, {r3, r4, ip, lr}		@	4+1
+1:	PLD(	pld	[r1, #2 * L1_CACHE_BYTES])
+	PLD(	pld	[r1, #3 * L1_CACHE_BYTES])
+2:						@ the final pass enters here and skips the two prefetches
+	.rept	(2 * L1_CACHE_BYTES / 16 - 1)
+		stmia	r0!, {r3, r4, ip, lr}		@	4
+		ldmia	r1!, {r3, r4, ip, lr}		@	4
+	.endr
+		subs	r2, r2, #1			@	1
+		stmia	r0!, {r3, r4, ip, lr}		@	4
+		ldmgtia	r1!, {r3, r4, ip, lr}		@	4
+		bgt	1b				@	1
+	PLD(	ldmeqia r1!, {r3, r4, ip, lr}	)	@ counter hit zero: load the start of the last chunk
+	PLD(	beq	2b			)	@ and copy it without prefetching, presumably to stay inside the page
+		ldmfd	sp!, {r4, pc}			@	3
+ENDPROC(copy_page)
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -0,0 +1,267 @@
+/*
+ *  linux/arch/arm/lib/copy_template.S
+ *
+ *  Code template for optimized memory copy functions
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+/*
+ * Theory of operation
+ * -------------------
+ *
+ * This file provides the core code for a forward memory copy used in
+ * the implementation of memcpy(), copy_to_user() and copy_from_user().
+ *
+ * The including file must define the following accessor macros
+ * according to the need of the given function:
+ *
+ * ldr1w ptr reg abort
+ *
+ *	This loads one word from 'ptr', stores it in 'reg' and increments
+ *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
+ *
+ * ldr4w ptr reg1 reg2 reg3 reg4 abort
+ * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ *
+ *	This loads four or eight words starting from 'ptr', stores them
+ *	in provided registers and increments 'ptr' past those words.
+ *	The 'abort' argument is used for fixup tables.
+ *
+ * ldr1b ptr reg cond abort
+ *
+ *	Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
+ *	It also must apply the condition code if provided, otherwise the
+ *	"al" condition is assumed by default.
+ *
+ * str1w ptr reg abort
+ * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ * str1b ptr reg cond abort
+ *
+ *	Same as their ldr* counterparts, but data is stored to 'ptr' location
+ *	rather than being loaded.
+ *
+ * enter reg1 reg2
+ *
+ *	Preserve the provided registers on the stack plus any additional
+ *	data as needed by the implementation including this code. Called
+ *	upon code entry.
+ *
+ * exit reg1 reg2
+ *
+ *	Restore registers with the values previously saved with the
+ *	'enter' macro. Called upon code termination.
+ *
+ * LDR1W_SHIFT
+ * STR1W_SHIFT
+ *
+ *	Correction to be applied to the "ip" register when branching into
+ *	the ldr1w or str1w instructions (some of these macros may expand to
+ *	more than one 32bit instruction in Thumb-2)
+ */
+
+
+		enter	r4, lr
+
+		subs	r2, r2, #4		@ r2 = count - 4
+		blt	8f			@ fewer than 4 bytes: byte tail only
+		ands	ip, r0, #3		@ destination word aligned?
+	PLD(	pld	[r1, #0]		)
+		bne	9f			@ no: copy leading bytes first
+		ands	ip, r1, #3		@ source word aligned?
+		bne	10f			@ no: use the shifting copy
+
+1:		subs	r2, r2, #(28)		@ r2 = count - 32
+		stmfd	sp!, {r5 - r8}
+		blt	5f			@ fewer than 32 bytes left
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	r3, ip, #32		)
+	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #0]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+3:	PLD(	pld	[r1, #124]		)
+4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #32		@ 32 more bytes accounted for
+		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		bge	3b			@ at least 32 bytes still to copy
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28		@ ip = bytes left in whole words (0 to 28)
+		rsb	ip, ip, #32		@ ip = offset for the computed jumps below
+#if LDR1W_SHIFT > 0
+		lsl	ip, ip, #LDR1W_SHIFT
+#endif
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f			@ no whole words left
+6:						@ base of the load run, also used by the CALGN jump above
+		.rept	(1 << LDR1W_SHIFT)
+		W(nop)
+		.endr
+		ldr1w	r1, r3, abort=20f
+		ldr1w	r1, r4, abort=20f
+		ldr1w	r1, r5, abort=20f
+		ldr1w	r1, r6, abort=20f
+		ldr1w	r1, r7, abort=20f
+		ldr1w	r1, r8, abort=20f
+		ldr1w	r1, lr, abort=20f
+
+#if LDR1W_SHIFT < STR1W_SHIFT
+		lsl	ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
+#elif LDR1W_SHIFT > STR1W_SHIFT
+		lsr	ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
+#endif
+		add	pc, pc, ip		@ skip the same number of words in the store run
+		nop
+		.rept	(1 << STR1W_SHIFT)
+		W(nop)
+		.endr
+		str1w	r0, r3, abort=20f
+		str1w	r0, r4, abort=20f
+		str1w	r0, r5, abort=20f
+		str1w	r0, r6, abort=20f
+		str1w	r0, r7, abort=20f
+		str1w	r0, r8, abort=20f
+		str1w	r0, lr, abort=20f
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31		@ ne: bit 0 of the count set (1 byte), cs: bit 1 set (2 bytes)
+		ldr1b	r1, r3, ne, abort=21f
+		ldr1b	r1, r4, cs, abort=21f
+		ldr1b	r1, ip, cs, abort=21f
+		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r4, cs, abort=21f
+		str1b	r0, ip, cs, abort=21f
+
+		exit	r4, pc
+
+9:		rsb	ip, ip, #4		@ ip = bytes needed to word-align the destination (1 to 3)
+		cmp	ip, #2			@ gt: three bytes, ge: two or more
+		ldr1b	r1, r3, gt, abort=21f
+		ldr1b	r1, r4, ge, abort=21f
+		ldr1b	r1, lr, abort=21f
+		str1b	r0, r3, gt, abort=21f
+		str1b	r0, r4, ge, abort=21f
+		subs	r2, r2, ip		@ account for the bytes just copied
+		str1b	r0, lr, abort=21f
+		blt	8b			@ fewer than 4 bytes remain
+		ands	ip, r1, #3		@ is the source aligned as well?
+		beq	1b
+
+10:		bic	r1, r1, #3		@ round the source down to a word boundary
+		cmp	ip, #2			@ source misalignment: 1 falls through, 2 goes to 17, 3 goes to 18
+		ldr1w	r1, lr, abort=21f
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28		@ r2 = count - 32
+		blt	14f			@ fewer than 32 bytes: word-at-a-time loop
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #0]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+12:	PLD(	pld	[r1, #124]		)
+13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
+		mov	r3, lr, pull #\pull	@ start the first output word from the word loaded earlier (pull and push are shift names from asm/assembler.h)
+		subs	r2, r2, #32		@ 32 more bytes accounted for
+		ldr4w	r1, r8, r9, ip, lr, abort=19f
+		orr	r3, r3, r4, push #\push	@ complete it with the start of the next source word
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		bge	12b			@ flags still come from the subs above
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28		@ ip = bytes left in whole words
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr1w	r1, lr, abort=21f
+		subs	ip, ip, #4		@ one more word done
+		orr	r3, r3, lr, push #\push
+		str1w	r0, r3, abort=21f
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		sub	r1, r1, #(\push / 8)	@ move the source back over the unconsumed bytes of the last word
+		b	8b			@ finish with the byte tail
+
+		.endm
+
+@ one instance per source misalignment: 1 byte (falls through from label 10), 2 bytes (17), 3 bytes (18)
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
+/*
+ * Abort preamble and completion macros.
+ * If a fixup handler is required then those macros must surround it.
+ * It is assumed that the fixup code will handle the private part of
+ * the exit macro.
+ */
+
+	.macro	copy_abort_preamble
+19:	ldmfd	sp!, {r5 - r9}		@ abort inside the shifting loop: undo its push of r5 - r9
+	b	21f
+20:	ldmfd	sp!, {r5 - r8}		@ abort inside the aligned path: undo its push of r5 - r8
+21:					@ abort with no extra registers on the stack
+	.endm
+
+	.macro	copy_abort_end
+	ldmfd	sp!, {r4, pc}		@ pops r4 and the return address pushed by the enter r4, lr invocation
+	.endm
+
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -0,0 +1,106 @@
+/*
+ *  linux/arch/arm/lib/copy_to_user.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 29, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Prototype:
+ *
+ *	size_t __copy_to_user(void *to, const void *from, size_t n)
+ *
+ * Purpose:
+ *
+ *	copy a block to user memory from kernel memory
+ *
+ * Params:
+ *
+ *	to = user memory
+ *	from = kernel memory
+ *	n = number of bytes to copy
+ *
+ * Return value:
+ *
+ *	Number of bytes NOT copied.
+ */
+
+#define LDR1W_SHIFT	0	/* ldr1w below is a single W(ldr) in every configuration */
+#ifndef CONFIG_THUMB2_KERNEL
+#define STR1W_SHIFT	0	/* copy_template.S scales its jump into the str1w run by 1 << STR1W_SHIFT */
+#else
+#define STR1W_SHIFT	1	/* Thumb-2 kernel: the jump offset into the str1w run is doubled */
+#endif
+
+	.macro ldr1w ptr reg abort
+	W(ldr) \reg, [\ptr], #4		@ plain load, the abort argument is unused (source is kernel memory)
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}	@ four words in one load-multiple
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}	@ eight words in one load-multiple
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1	@ plain conditional byte load, the abort argument is unused
+	.endm
+
+	.macro str1w ptr reg abort
+	strusr	\reg, \ptr, 4, abort=\abort
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	str1w \ptr, \reg1, \abort
+	str1w \ptr, \reg2, \abort
+	str1w \ptr, \reg3, \abort
+	str1w \ptr, \reg4, \abort
+	str1w \ptr, \reg5, \abort
+	str1w \ptr, \reg6, \abort
+	str1w \ptr, \reg7, \abort
+	str1w \ptr, \reg8, \abort
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	strusr	\reg, \ptr, 1, \cond, abort=\abort
+	.endm
+
+	.macro enter reg1 reg2
+	mov	r3, #0			@ this zero fills the slot that exit pops as the return value
+	stmdb	sp!, {r0, r2, r3, \reg1, \reg2}	@ r0 and r2 (destination and count) are kept for the abort handler
+	.endm
+
+	.macro exit reg1 reg2
+	add	sp, sp, #8		@ drop the saved destination and count
+	ldmfd	sp!, {r0, \reg1, \reg2}	@ r0 = the zero stored by enter: no bytes left uncopied
+	.endm
+
+	.text
+
+ENTRY(__copy_to_user_std)
+WEAK(__copy_to_user)
+
+#include "copy_template.S"
+
+ENDPROC(__copy_to_user)
+ENDPROC(__copy_to_user_std)
+
+	.pushsection .fixup,"ax"
+	.align 0
+	copy_abort_preamble
+	ldmfd	sp!, {r1, r2, r3}	@ r1 = original destination, r2 = original count, r3 = the zero stored by enter
+	sub	r0, r0, r1		@ r0 = how far the destination pointer had advanced
+	rsb	r0, r0, r2		@ r0 = count minus that: the bytes not copied (return value)
+	copy_abort_end
+	.popsection
+
--- a/arch/arm/lib/csumipv6.S
+++ b/arch/arm/lib/csumipv6.S
@@ -0,0 +1,33 @@
+/*
+ *  linux/arch/arm/lib/csumipv6.S
+ *
+ *  Copyright (C) 1995-1998 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+ENTRY(__csum_ipv6_magic)
+		str	lr, [sp, #-4]!		@ lr is used as a scratch register below
+		adds	ip, r2, r3		@ start the sum with the r2 and r3 arguments
+		ldmia	r1, {r1 - r3, lr}	@ four words from the buffer r1 points at
+		adcs	ip, ip, r1		@ each adcs also adds the carry of the previous add
+		adcs	ip, ip, r2
+		adcs	ip, ip, r3
+		adcs	ip, ip, lr
+		ldmia	r0, {r0 - r3}		@ four words from the buffer r0 points at
+		adcs	r0, ip, r0
+		adcs	r0, r0, r1
+		adcs	r0, r0, r2
+		ldr	r2, [sp, #4]		@ the word the caller left at the top of its stack (just above the saved lr)
+		adcs	r0, r0, r3
+		adcs	r0, r0, r2
+		adcs	r0, r0, #0		@ fold in the final carry, r0 is the result
+		ldmfd	sp!, {pc}
+ENDPROC(__csum_ipv6_magic)
+
--- a/arch/arm/lib/csumpartial.S
+++ b/arch/arm/lib/csumpartial.S
@@ -0,0 +1,142 @@
+/*
+ *  linux/arch/arm/lib/csumpartial.S
+ *
+ *  Copyright (C) 1995-1998 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+/*
+ * Function: __u32 csum_partial(const char *src, int len, __u32 sum)
+ * Params  : r0 = buffer, r1 = len, r2 = checksum
+ * Returns : r0 = new checksum
+ */
+
+buf	.req	r0	@ buffer pointer, advanced as data is consumed
+len	.req	r1	@ bytes to checksum
+sum	.req	r2	@ running checksum
+td0	.req	r3
+td1	.req	r4	@ save before use
+td2	.req	r5	@ save before use
+td3	.req	lr
+
+.Lzero:		mov	r0, sum			@ zero length: the result is the incoming sum
+		add	sp, sp, #4		@ discard the saved buf
+		ldr	pc, [sp], #4		@ return through the saved lr
+
+		/*
+		 * Handle 0 to 7 bytes, with any alignment of the buffer
+		 * pointer.  Note that when we get here, C = 0
+		 */
+.Lless8:		teq	len, #0			@ check for zero count
+		beq	.Lzero
+
+		/* we must have at least one byte. */
+		tst	buf, #1			@ odd address?
+		movne	sum, sum, ror #8	@ yes: rotate sum by 8 bits (.Ldone rotates the result again)
+		ldrneb	td0, [buf], #1
+		subne	len, len, #1
+		adcnes	sum, sum, td0, put_byte_1	@ add the leading byte (put_byte_1 is from asm/assembler.h)
+
+.Lless4:		tst	len, #6			@ any halfwords left?
+		beq	.Lless8_byte
+
+		/* we are now half-word aligned */
+
+.Lless8_wordlp:
+#if __LINUX_ARM_ARCH__ >= 4
+		ldrh	td0, [buf], #2
+		sub	len, len, #2
+#else
+		ldrb	td0, [buf], #1
+		ldrb	td3, [buf], #1
+		sub	len, len, #2
+#ifndef __ARMEB__
+		orr	td0, td0, td3, lsl #8
+#else
+		orr	td0, td3, td0, lsl #8
+#endif
+#endif
+		adcs	sum, sum, td0		@ add the halfword together with the pending carry
+		tst	len, #6
+		bne	.Lless8_wordlp
+
+.Lless8_byte:	tst	len, #1			@ odd number of bytes
+		ldrneb	td0, [buf], #1		@ include last byte
+		adcnes	sum, sum, td0, put_byte_0	@ update checksum
+
+.Ldone:		adc	r0, sum, #0		@ collect up the last carry
+		ldr	td0, [sp], #4		@ td0 = buf exactly as it was saved on entry
+		tst	td0, #1			@ check buffer alignment
+		movne	r0, r0, ror #8		@ rotate checksum by 8 bits
+		ldr	pc, [sp], #4		@ return
+
+.Lnot_aligned:	tst	buf, #1			@ odd address
+		ldrneb	td0, [buf], #1		@ make even
+		subne	len, len, #1
+		adcnes	sum, sum, td0, put_byte_1	@ update checksum
+
+		tst	buf, #2			@ 32-bit aligned?
+#if __LINUX_ARM_ARCH__ >= 4
+		ldrneh	td0, [buf], #2		@ make 32-bit aligned
+		subne	len, len, #2
+#else
+		ldrneb	td0, [buf], #1		@ no ldrh in this case: build the halfword from two bytes
+		ldrneb	ip, [buf], #1
+		subne	len, len, #2
+#ifndef __ARMEB__
+		orrne	td0, td0, ip, lsl #8
+#else
+		orrne	td0, ip, td0, lsl #8
+#endif
+#endif
+		adcnes	sum, sum, td0		@ update checksum
+		mov	pc, lr			@ return to the blne in csum_partial
+
+ENTRY(csum_partial)
+		stmfd	sp!, {buf, lr}		@ the saved buf is inspected again in .Ldone
+		cmp	len, #8			@ Ensure that we have at least
+		blo	.Lless8			@ 8 bytes to sum.
+
+		tst	buf, #1			@ odd start address?
+		movne	sum, sum, ror #8	@ yes: rotate sum by 8 bits (.Ldone rotates the result again)
+
+		adds	sum, sum, #0		@ C = 0
+		tst	buf, #3			@ Test buffer alignment
+		blne	.Lnot_aligned		@ align buffer, return here
+
+1:		bics	ip, len, #31		@ ip = byte count held in whole 32-byte blocks
+		beq	3f
+
+		stmfd	sp!, {r4 - r5}		@ td1 and td2 live in r4 and r5
+2:		ldmia	buf!, {td0, td1, td2, td3}
+		adcs	sum, sum, td0
+		adcs	sum, sum, td1
+		adcs	sum, sum, td2
+		adcs	sum, sum, td3
+		ldmia	buf!, {td0, td1, td2, td3}
+		adcs	sum, sum, td0
+		adcs	sum, sum, td1
+		adcs	sum, sum, td2
+		adcs	sum, sum, td3
+		sub	ip, ip, #32		@ no S suffix, so the carry is left alone
+		teq	ip, #0			@ teq leaves C intact for the next adcs
+		bne	2b
+		ldmfd	sp!, {r4 - r5}
+
+3:		tst	len, #0x1c		@ should not change C
+		beq	.Lless4
+
+4:		ldr	td0, [buf], #4
+		sub	len, len, #4
+		adcs	sum, sum, td0
+		tst	len, #0x1c
+		bne	4b
+		b	.Lless4			@ at most 3 bytes remain
+ENDPROC(csum_partial)
--- a/arch/arm/lib/csumpartialcopy.S
+++ b/arch/arm/lib/csumpartialcopy.S
@@ -0,0 +1,53 @@
+/*
+ *  linux/arch/arm/lib/csumpartialcopy.S
+ *
+ *  Copyright (C) 1995-1998 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+/* Function: __u32 csum_partial_copy_nocheck(const char *src, char *dst, int len, __u32 sum)
+ * Params  : r0 = src, r1 = dst, r2 = len, r3 = checksum
+ * Returns : r0 = new checksum
+ */
+
+		/*
+		 * Hooks consumed by csumpartialcopygeneric.S (included below).
+		 *
+		 * save_regs/load_regs: r1 (dst) is the lowest register in the
+		 * list, so the generic .Ldone code can reload the original
+		 * destination pointer from [sp, #0].  load_regs restores into
+		 * pc, so it is also the function return.
+		 */
+		.macro	save_regs
+		stmfd	sp!, {r1, r4 - r8, lr}
+		.endm
+
+		.macro	load_regs
+		ldmfd	sp!, {r1, r4 - r8, pc}
+		.endm
+
+		/*
+		 * loadNb / loadNl: fetch N bytes / N words from the source
+		 * pointer in r0, post-incrementing r0.  This variant reads
+		 * with plain ldrb/ldr/ldmia.
+		 */
+		.macro	load1b, reg1
+		ldrb	\reg1, [r0], #1
+		.endm
+
+		.macro	load2b, reg1, reg2
+		ldrb	\reg1, [r0], #1
+		ldrb	\reg2, [r0], #1
+		.endm
+
+		.macro	load1l, reg1
+		ldr	\reg1, [r0], #4
+		.endm
+
+		.macro	load2l, reg1, reg2
+		ldr	\reg1, [r0], #4
+		ldr	\reg2, [r0], #4
+		.endm
+
+		.macro	load4l, reg1, reg2, reg3, reg4
+		ldmia	r0!, {\reg1, \reg2, \reg3, \reg4}
+		.endm
+
+#define FN_ENTRY	ENTRY(csum_partial_copy_nocheck)
+#define FN_EXIT		ENDPROC(csum_partial_copy_nocheck)
+
+#include "csumpartialcopygeneric.S"
--- a/arch/arm/lib/csumpartialcopygeneric.S
+++ b/arch/arm/lib/csumpartialcopygeneric.S
@@ -0,0 +1,332 @@
+/*
+ *  linux/arch/arm/lib/csumpartialcopygeneric.S
+ *
+ *  Copyright (C) 1995-2001 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * unsigned int
+ * csum_partial_copy_xxx(const char *src, char *dst, int len, int sum, ...)
+ *  r0 = src, r1 = dst, r2 = len, r3 = sum
+ *  Returns : r0 = checksum
+ *
+ * Note that 'tst' and 'teq' preserve the carry flag.
+ */
+
+/* Register aliases used throughout the generic copy-and-checksum code. */
+src	.req	r0
+dst	.req	r1
+len	.req	r2
+sum	.req	r3
+
+/*
+ * Zero-length request (reached from .Lless8): return the incoming sum
+ * unchanged.  load_regs is supplied by the including file and returns.
+ */
+.Lzero:		mov	r0, sum
+		load_regs
+
+		/*
+		 * Align an unaligned destination pointer.  We know that
+		 * we have >= 8 bytes here, so we don't need to check
+		 * the length.  Note that the source pointer hasn't been
+		 * aligned yet.
+		 */
+.Ldst_unaligned:
+		/*
+		 * Called with "blne" from the entry code; both exits return
+		 * through lr.  An odd dst first takes a single byte; the
+		 * two-byte step then runs only if bit 1 of dst is still set.
+		 * The carry from the last adcs is left in C for the caller.
+		 */
+		tst	dst, #1
+		beq	.Ldst_16bit		@ dst even, so bit 1 must be set
+
+		load1b	ip
+		sub	len, len, #1
+		adcs	sum, sum, ip, put_byte_1	@ update checksum
+		strb	ip, [dst], #1
+		tst	dst, #2
+		moveq	pc, lr			@ dst is now 32bit aligned
+
+.Ldst_16bit:	load2b	r8, ip
+		sub	len, len, #2
+		adcs	sum, sum, r8, put_byte_0
+		strb	r8, [dst], #1
+		adcs	sum, sum, ip, put_byte_1
+		strb	ip, [dst], #1
+		mov	pc, lr			@ dst is now 32bit aligned
+
+		/*
+		 * Handle 0 to 7 bytes, with any alignment of source and
+		 * destination pointers.  Note that when we get here, C = 0
+		 */
+.Lless8:	teq	len, #0			@ check for zero count
+		beq	.Lzero
+
+		/* we must have at least one byte. */
+		tst	dst, #1			@ dst 16-bit aligned
+		beq	.Lless8_aligned
+
+		/* Align dst */
+		load1b	ip
+		sub	len, len, #1
+		adcs	sum, sum, ip, put_byte_1	@ update checksum
+		strb	ip, [dst], #1
+		tst	len, #6			@ at least two bytes left?
+		beq	.Lless8_byteonly
+
+		/*
+		 * Copy and sum two bytes per pass.  len is below 8 here, so
+		 * (len & 6) != 0 is the same as "two or more bytes remain".
+		 */
+1:		load2b	r8, ip
+		sub	len, len, #2
+		adcs	sum, sum, r8, put_byte_0
+		strb	r8, [dst], #1
+		adcs	sum, sum, ip, put_byte_1
+		strb	ip, [dst], #1
+.Lless8_aligned:
+		tst	len, #6
+		bne	1b
+.Lless8_byteonly:
+		tst	len, #1			@ one odd byte left over?
+		beq	.Ldone
+		load1b	r8
+		adcs	sum, sum, r8, put_byte_0	@ update checksum
+		strb	r8, [dst], #1
+		b	.Ldone
+
+/*
+ * Function entry.  FN_ENTRY, save_regs, load_regs and the loadNb/loadNl
+ * macros are provided by the file that includes this one.  Lengths
+ * below 8 are handled by .Lless8; otherwise dst is word-aligned first
+ * and the copy continues here (src aligned) or in .Lsrc_not_aligned.
+ * Throughout, the carry of the running sum is kept in the C flag
+ * between adcs instructions and only folded in at .Ldone.
+ */
+FN_ENTRY
+		save_regs
+
+		cmp	len, #8			@ Ensure that we have at least
+		blo	.Lless8			@ 8 bytes to copy.
+
+		adds	sum, sum, #0		@ C = 0
+		tst	dst, #3			@ Test destination alignment
+		blne	.Ldst_unaligned		@ align destination, return here
+
+		/*
+		 * Ok, the dst pointer is now 32bit aligned, and we know
+		 * that we must have more than 4 bytes to copy.  Note
+		 * that C contains the carry from the dst alignment above.
+		 */
+
+		tst	src, #3			@ Test source alignment
+		bne	.Lsrc_not_aligned
+
+		/* Routine for src & dst aligned */
+
+		bics	ip, len, #15		@ ip = bytes in whole 16-byte chunks
+		beq	2f
+
+1:		load4l	r4, r5, r6, r7
+		stmia	dst!, {r4, r5, r6, r7}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		adcs	sum, sum, r6
+		adcs	sum, sum, r7
+		sub	ip, ip, #16
+		teq	ip, #0			@ teq keeps C from the adcs chain
+		bne	1b
+
+2:		ands	ip, len, #12		@ 4, 8 or 12 bytes of whole words left?
+		beq	4f
+		tst	ip, #8
+		beq	3f
+		load2l	r4, r5
+		stmia	dst!, {r4, r5}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		tst	ip, #4
+		beq	4f
+
+3:		load1l	r4
+		str	r4, [dst], #4
+		adcs	sum, sum, r4
+
+		/*
+		 * 1-3 trailing bytes: a whole source word is loaded into r4
+		 * and only the bytes that are needed are stored and summed.
+		 */
+4:		ands	len, len, #3
+		beq	.Ldone
+		load1l	r4
+		tst	len, #2
+		mov	r5, r4, get_byte_0
+		beq	.Lexit
+		adcs	sum, sum, r4, push #16
+		strb	r5, [dst], #1
+		mov	r5, r4, get_byte_1
+		strb	r5, [dst], #1
+		mov	r5, r4, get_byte_2
+		/* shared tail: r5 holds the final odd byte (if len is odd) */
+.Lexit:		tst	len, #1
+		strneb	r5, [dst], #1
+		andne	r5, r5, #255
+		adcnes	sum, sum, r5, put_byte_0
+
+		/*
+		 * If the dst pointer was not 16-bit aligned, we
+		 * need to rotate the checksum here to get around
+		 * the inefficient byte manipulations in the
+		 * architecture independent code.
+		 */
+.Ldone:		adc	r0, sum, #0		@ fold in the last carry
+		ldr	sum, [sp, #0]		@ dst
+		tst	sum, #1
+		movne	r0, r0, ror #8
+		load_regs
+
+/*
+ * dst is word-aligned but src is not.  src is rounded down to a word
+ * boundary and read a word at a time; r4 always carries the not yet
+ * consumed bytes of the previous source word, and each output word is
+ * assembled from r4 and the next word with the pull/push shifts
+ * (defined outside this file, presumably endian-dependent lsr/lsl -
+ * confirm in asm/assembler.h).  The code directly below handles a
+ * source offset of 1; offsets 2 and 3 branch to their own copies.
+ */
+.Lsrc_not_aligned:
+		adc	sum, sum, #0		@ include C from dst alignment
+		and	ip, src, #3		@ ip = byte offset of src in its word
+		bic	src, src, #3
+		load1l	r5
+		cmp	ip, #2
+		beq	.Lsrc2_aligned
+		bhi	.Lsrc3_aligned
+		mov	r4, r5, pull #8		@ C = 0
+		bics	ip, len, #15
+		beq	2f
+1:		load4l	r5, r6, r7, r8
+		orr	r4, r4, r5, push #24
+		mov	r5, r5, pull #8
+		orr	r5, r5, r6, push #24
+		mov	r6, r6, pull #8
+		orr	r6, r6, r7, push #24
+		mov	r7, r7, pull #8
+		orr	r7, r7, r8, push #24
+		stmia	dst!, {r4, r5, r6, r7}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		adcs	sum, sum, r6
+		adcs	sum, sum, r7
+		mov	r4, r8, pull #8		@ carry leftover bytes to next pass
+		sub	ip, ip, #16
+		teq	ip, #0
+		bne	1b
+2:		ands	ip, len, #12
+		beq	4f
+		tst	ip, #8
+		beq	3f
+		load2l	r5, r6
+		orr	r4, r4, r5, push #24
+		mov	r5, r5, pull #8
+		orr	r5, r5, r6, push #24
+		stmia	dst!, {r4, r5}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		mov	r4, r6, pull #8
+		tst	ip, #4
+		beq	4f
+3:		load1l	r5
+		orr	r4, r4, r5, push #24
+		str	r4, [dst], #4
+		adcs	sum, sum, r4
+		mov	r4, r5, pull #8
+		/* 1-3 trailing bytes are all still available in r4 */
+4:		ands	len, len, #3
+		beq	.Ldone
+		mov	r5, r4, get_byte_0
+		tst	len, #2
+		beq	.Lexit
+		adcs	sum, sum, r4, push #16
+		strb	r5, [dst], #1
+		mov	r5, r4, get_byte_1
+		strb	r5, [dst], #1
+		mov	r5, r4, get_byte_2
+		b	.Lexit
+
+/*
+ * Source offset 2 within its word: same scheme as .Lsrc_not_aligned
+ * with 16/16 bit shifts, so r4 carries two leftover source bytes.
+ * The "cmp ip, #2" on the way here left C set, hence the explicit
+ * "adds sum, sum, #0" to clear it before the adcs chain starts.
+ */
+.Lsrc2_aligned:	mov	r4, r5, pull #16
+		adds	sum, sum, #0		@ C = 0
+		bics	ip, len, #15
+		beq	2f
+1:		load4l	r5, r6, r7, r8
+		orr	r4, r4, r5, push #16
+		mov	r5, r5, pull #16
+		orr	r5, r5, r6, push #16
+		mov	r6, r6, pull #16
+		orr	r6, r6, r7, push #16
+		mov	r7, r7, pull #16
+		orr	r7, r7, r8, push #16
+		stmia	dst!, {r4, r5, r6, r7}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		adcs	sum, sum, r6
+		adcs	sum, sum, r7
+		mov	r4, r8, pull #16
+		sub	ip, ip, #16
+		teq	ip, #0
+		bne	1b
+2:		ands	ip, len, #12
+		beq	4f
+		tst	ip, #8
+		beq	3f
+		load2l	r5, r6
+		orr	r4, r4, r5, push #16
+		mov	r5, r5, pull #16
+		orr	r5, r5, r6, push #16
+		stmia	dst!, {r4, r5}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		mov	r4, r6, pull #16
+		tst	ip, #4
+		beq	4f
+3:		load1l	r5
+		orr	r4, r4, r5, push #16
+		str	r4, [dst], #4
+		adcs	sum, sum, r4
+		mov	r4, r5, pull #16
+		/*
+		 * Trailing bytes: r4 supplies the first two; a third byte,
+		 * if needed, is fetched from the source with load1b.
+		 */
+4:		ands	len, len, #3
+		beq	.Ldone
+		mov	r5, r4, get_byte_0
+		tst	len, #2
+		beq	.Lexit
+		adcs	sum, sum, r4
+		strb	r5, [dst], #1
+		mov	r5, r4, get_byte_1
+		strb	r5, [dst], #1
+		tst	len, #1
+		beq	.Ldone
+		load1b	r5
+		b	.Lexit
+
+/*
+ * Source offset 3 within its word: same scheme as .Lsrc_not_aligned
+ * with 24/8 bit shifts, so r4 carries a single leftover source byte.
+ * The "cmp ip, #2" on the way here left C set, hence the explicit
+ * "adds sum, sum, #0" to clear it before the adcs chain starts.
+ */
+.Lsrc3_aligned:	mov	r4, r5, pull #24
+		adds	sum, sum, #0		@ C = 0
+		bics	ip, len, #15
+		beq	2f
+1:		load4l	r5, r6, r7, r8
+		orr	r4, r4, r5, push #8
+		mov	r5, r5, pull #24
+		orr	r5, r5, r6, push #8
+		mov	r6, r6, pull #24
+		orr	r6, r6, r7, push #8
+		mov	r7, r7, pull #24
+		orr	r7, r7, r8, push #8
+		stmia	dst!, {r4, r5, r6, r7}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		adcs	sum, sum, r6
+		adcs	sum, sum, r7
+		mov	r4, r8, pull #24
+		sub	ip, ip, #16
+		teq	ip, #0
+		bne	1b
+2:		ands	ip, len, #12
+		beq	4f
+		tst	ip, #8
+		beq	3f
+		load2l	r5, r6
+		orr	r4, r4, r5, push #8
+		mov	r5, r5, pull #24
+		orr	r5, r5, r6, push #8
+		stmia	dst!, {r4, r5}
+		adcs	sum, sum, r4
+		adcs	sum, sum, r5
+		mov	r4, r6, pull #24
+		tst	ip, #4
+		beq	4f
+3:		load1l	r5
+		orr	r4, r4, r5, push #8
+		str	r4, [dst], #4
+		adcs	sum, sum, r4
+		mov	r4, r5, pull #24
+		/*
+		 * Trailing bytes: r4 supplies only the first one; for two or
+		 * three bytes another whole source word is loaded into r4.
+		 */
+4:		ands	len, len, #3
+		beq	.Ldone
+		mov	r5, r4, get_byte_0
+		tst	len, #2
+		beq	.Lexit
+		strb	r5, [dst], #1
+		adcs	sum, sum, r4
+		load1l	r4
+		mov	r5, r4, get_byte_0
+		strb	r5, [dst], #1
+		adcs	sum, sum, r4, push #24
+		mov	r5, r4, get_byte_1
+		b	.Lexit
+FN_EXIT
--- a/arch/arm/lib/csumpartialcopyuser.S
+++ b/arch/arm/lib/csumpartialcopyuser.S
@@ -0,0 +1,83 @@
+/*
+ *  linux/arch/arm/lib/csumpartialcopyuser.S
+ *
+ *  Copyright (C) 1995-1998 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 27/03/03 Ian Molton Clean up CONFIG_CPU
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+
+		.text
+
+		/*
+		 * Hooks consumed by csumpartialcopygeneric.S (included below).
+		 *
+		 * save_regs pushes eight words with r1 (dst) and r2 (len)
+		 * lowest; the fault handler at 9001 reloads both from the
+		 * stack and finds the stacked err_ptr argument at
+		 * [sp, #8*4].  load_regs restores into pc and so returns.
+		 */
+		.macro	save_regs
+		stmfd	sp!, {r1, r2, r4 - r8, lr}
+		.endm
+
+		.macro	load_regs
+		ldmfd	sp!, {r1, r2, r4 - r8, pc}
+		.endm
+
+		/*
+		 * loadNb / loadNl: fetch N bytes / N words from the source
+		 * pointer in r0 using ldrusr, which is defined outside this
+		 * file.  NOTE(review): presumably ldrusr is a user-space
+		 * load that post-increments r0 by its last argument and
+		 * routes faults to label 9001 - confirm in asm/assembler.h.
+		 */
+		.macro	load1b,	reg1
+		ldrusr	\reg1, r0, 1
+		.endm
+
+		.macro	load2b, reg1, reg2
+		ldrusr	\reg1, r0, 1
+		ldrusr	\reg2, r0, 1
+		.endm
+
+		.macro	load1l, reg1
+		ldrusr	\reg1, r0, 4
+		.endm
+
+		.macro	load2l, reg1, reg2
+		ldrusr	\reg1, r0, 4
+		ldrusr	\reg2, r0, 4
+		.endm
+
+		.macro	load4l, reg1, reg2, reg3, reg4
+		ldrusr	\reg1, r0, 4
+		ldrusr	\reg2, r0, 4
+		ldrusr	\reg3, r0, 4
+		ldrusr	\reg4, r0, 4
+		.endm
+
+/*
+ * unsigned int
+ * csum_partial_copy_from_user(const char *src, char *dst, int len, int sum, int *err_ptr)
+ *  r0 = src, r1 = dst, r2 = len, r3 = sum, [sp] = *err_ptr
+ *  Returns : r0 = checksum, [[sp, #0], #0] = 0 or -EFAULT
+ */
+
+#define FN_ENTRY	ENTRY(csum_partial_copy_from_user)
+#define FN_EXIT		ENDPROC(csum_partial_copy_from_user)
+
+#include "csumpartialcopygeneric.S"
+
+/*
+ * FIXME: minor buglet here
+ * We don't return the checksum for the data present in the buffer.  To do
+ * so properly, we would have to add in whatever registers were loaded before
+ * the fault, which, with the current asm above is not predictable.
+ */
+		/*
+		 * Fault path, placed in the .fixup section.  Stores -EFAULT
+		 * through err_ptr, then zero-fills the whole destination
+		 * buffer using the dst and len values stacked by save_regs,
+		 * and returns with r0 = 0.
+		 */
+		.pushsection .fixup,"ax"
+		.align	4
+9001:		mov	r4, #-EFAULT
+		ldr	r5, [sp, #8*4]		@ *err_ptr
+		str	r4, [r5]
+		ldmia	sp, {r1, r2}		@ retrieve dst, len
+		add	r2, r2, r1		@ r2 = one past the end of dst
+		mov	r0, #0			@ zero the buffer
+9002:		teq	r2, r1
+		strneb	r0, [r1], #1
+		bne	9002b
+		load_regs
+		.popsection
--- a/arch/arm/lib/delay-loop.S
+++ b/arch/arm/lib/delay-loop.S
@@ -0,0 +1,67 @@
+/*
+ *  linux/arch/arm/lib/delay-loop.S
+ *
+ *  Copyright (C) 1995, 1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/delay.h>
+		.text
+
+.LC0:		.word	loops_per_jiffy
+.LC1:		.word	UDELAY_MULT
+
+/*
+ * r0  <= 2000
+ * lpj <= 0x01ffffff (max. 3355 bogomips)
+ * HZ  <= 1000
+ */
+
+/*
+ * Three entry points that fall through into one another:
+ *   __loop_udelay       - scales r0 by UDELAY_MULT, then continues as
+ *   __loop_const_udelay - converts the scaled value in r0 into a loop
+ *                         count using loops_per_jiffy, then continues as
+ *   __loop_delay        - spins, decrementing r0 once per iteration.
+ * r1 is set to all ones so that "r1, lsr #32-n" yields 2^n - 1; adding
+ * that before each "lsr #n" rounds the shifted value up.
+ */
+ENTRY(__loop_udelay)
+		ldr	r2, .LC1
+		mul	r0, r2, r0
+ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0x7fffff06
+		mov	r1, #-1
+		ldr	r2, .LC0
+		ldr	r2, [r2]		@ max = 0x01ffffff
+		add	r0, r0, r1, lsr #32-14
+		mov	r0, r0, lsr #14		@ max = 0x0001ffff
+		add	r2, r2, r1, lsr #32-10
+		mov	r2, r2, lsr #10		@ max = 0x00007fff
+		mul	r0, r2, r0		@ max = 2^32-1
+		add	r0, r0, r1, lsr #32-6
+		movs	r0, r0, lsr #6
+		moveq	pc, lr			@ nothing to wait for
+
+/*
+ * loops = r0 * HZ * loops_per_jiffy / 1000000
+ */
+
+@ Delay routine
+@ Spins until r0 has counted down; an entry value of 0 returns at once
+@ because the first subs borrows and the bhi is then not taken.
+ENTRY(__loop_delay)
+		subs	r0, r0, #1
+#if 0
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+		movls	pc, lr
+		subs	r0, r0, #1
+#endif
+		bhi	__loop_delay
+		mov	pc, lr
+ENDPROC(__loop_udelay)
+ENDPROC(__loop_const_udelay)
+ENDPROC(__loop_delay)
--- a/arch/arm/lib/delay.c
+++ b/arch/arm/lib/delay.c
@@ -0,0 +1,93 @@
+/*
+ * Delay loops based on the OpenRISC implementation.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timex.h>
+
+/*
+ * Start out with the busy-loop implementations; the timer-backed ones
+ * are swapped in by register_current_timer_delay().
+ */
+struct arm_delay_ops arm_delay_ops = {
+	.udelay		= __loop_udelay,
+	.const_udelay	= __loop_const_udelay,
+	.delay		= __loop_delay,
+};
+
+static const struct delay_timer *delay_timer;
+static bool delay_calibrated;
+
+/*
+ * Store the current value of the registered delay timer in *timer_val.
+ * Returns 0 on success, or -ENXIO when no delay timer is registered
+ * (in which case *timer_val is left untouched).
+ */
+int read_current_timer(unsigned long *timer_val)
+{
+	if (delay_timer) {
+		*timer_val = delay_timer->read_current_timer();
+		return 0;
+	}
+
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(read_current_timer);
+
+/* Busy-wait until at least @cycles timer cycles have elapsed. */
+static void __timer_delay(unsigned long cycles)
+{
+	const cycles_t begin = get_cycles();
+
+	for (;;) {
+		/* unsigned difference, so a counter wrap is harmless */
+		if ((get_cycles() - begin) >= cycles)
+			break;
+		cpu_relax();
+	}
+}
+
+/*
+ * Convert a pre-scaled delay (@xloops) into timer cycles using the
+ * private ticks_per_jiffy value and wait that long.  The product is
+ * formed in 64 bits before being shifted down by UDELAY_SHIFT.
+ */
+static void __timer_const_udelay(unsigned long xloops)
+{
+	const unsigned long long scaled =
+		(unsigned long long)xloops * arm_delay_ops.ticks_per_jiffy;
+
+	__timer_delay(scaled >> UDELAY_SHIFT);
+}
+
+/* Wait @usecs microseconds using the timer-based delay. */
+static void __timer_udelay(unsigned long usecs)
+{
+	const unsigned long xloops = usecs * UDELAY_MULT;
+
+	__timer_const_udelay(xloops);
+}
+
+/*
+ * Switch the delay operations over to @timer.  Only the first call made
+ * before the delay loop has been calibrated takes effect; any later
+ * call is logged and ignored.
+ */
+void __init register_current_timer_delay(const struct delay_timer *timer)
+{
+	if (delay_calibrated) {
+		pr_info("Ignoring duplicate/late registration of read_current_timer delay\n");
+		return;
+	}
+
+	pr_info("Switching to timer-based delay loop\n");
+	delay_timer = timer;
+	lpj_fine = timer->freq / HZ;
+
+	/* cpufreq may scale loops_per_jiffy, so keep a private copy */
+	arm_delay_ops.ticks_per_jiffy = lpj_fine;
+	arm_delay_ops.delay = __timer_delay;
+	arm_delay_ops.const_udelay = __timer_const_udelay;
+	arm_delay_ops.udelay = __timer_udelay;
+
+	delay_calibrated = true;
+}
+
+/*
+ * Report the current lpj_fine value and mark the delay loop as
+ * calibrated, which makes register_current_timer_delay() reject any
+ * later registration.
+ */
+unsigned long __cpuinit calibrate_delay_is_known(void)
+{
+	const unsigned long lpj = lpj_fine;
+
+	delay_calibrated = true;
+
+	return lpj;
+}
--- a/arch/arm/lib/div64.S
+++ b/arch/arm/lib/div64.S
@@ -0,0 +1,211 @@
+/*
+ *  linux/arch/arm/lib/div64.S
+ *
+ *  Optimized computation of 64-bit dividend / 32-bit divisor
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Oct 5, 2003
+ *  Copyright:	Monta Vista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/unwind.h>
+
+#ifdef __ARMEB__
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#else
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#endif
+
+/*
+ * __do_div64: perform a division with 64-bit dividend and 32-bit divisor.
+ *
+ * Note: Calling convention is totally non standard for optimal code.
+ *       This is meant to be used by do_div() from include/asm/div64.h only.
+ *
+ * Input parameters:
+ * 	xh-xl	= dividend (clobbered)
+ * 	r4	= divisor (preserved)
+ *
+ * Output values:
+ * 	yh-yl	= result
+ * 	xh	= remainder
+ *
+ * Clobbered regs: xl, ip
+ */
+
+ENTRY(__do_div64)
+UNWIND(.fnstart)
+
+	@ Test for easy paths first.
+	subs	ip, r4, #1
+	bls	9f			@ divisor is 0 or 1
+	tst	ip, r4
+	beq	8f			@ divisor is power of 2
+
+	@ See if we need to handle upper 32-bit result.
+	cmp	xh, r4
+	mov	yh, #0
+	blo	3f
+
+	@ Align divisor with upper part of dividend.
+	@ The aligned divisor is stored in yl preserving the original.
+	@ The bit position is stored in ip.
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	yl, r4
+	clz	ip, xh
+	sub	yl, yl, ip
+	mov	ip, #1
+	mov	ip, ip, lsl yl
+	mov	yl, r4, lsl yl
+
+#else
+
+	mov	yl, r4
+	mov	ip, #1
+1:	cmp	yl, #0x80000000
+	cmpcc	yl, xh
+	movcc	yl, yl, lsl #1
+	movcc	ip, ip, lsl #1
+	bcc	1b
+
+#endif
+
+	@ The division loop for needed upper bit positions.
+	@ Break out early if dividend reaches 0.
+2:	cmp	xh, yl
+	orrcs	yh, yh, ip
+	subcss	xh, xh, yl
+	movnes	ip, ip, lsr #1
+	mov	yl, yl, lsr #1
+	bne	2b
+
+	@ See if we need to handle lower 32-bit result.
+	@ If the remaining dividend is already below the divisor, it is
+	@ the remainder and the low result word stays 0.
+3:	cmp	xh, #0
+	mov	yl, #0
+	cmpeq	xl, r4
+	movlo	xh, xl
+	movlo	pc, lr
+
+	@ The division loop for lower bit positions.
+	@ Here we shift remainder bits leftwards rather than moving the
+	@ divisor for comparisons, considering the carry-out bit as well.
+	mov	ip, #0x80000000
+4:	movs	xl, xl, lsl #1
+	adcs	xh, xh, xh
+	beq	6f
+	cmpcc	xh, r4
+5:	orrcs	yl, yl, ip
+	subcs	xh, xh, r4
+	movs	ip, ip, lsr #1
+	bne	4b
+	mov	pc, lr
+
+	@ The top part of remainder became zero.  If carry is set
+	@ (the 33rd bit) this is a false positive so resume the loop.
+	@ Otherwise, if lower part is also null then we are done.
+6:	bcs	5b
+	cmp	xl, #0
+	moveq	pc, lr
+
+	@ We still have remainder bits in the low part.  Bring them up.
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	xh, xl			@ we know xh is zero here so...
+	add	xh, xh, #1
+	mov	xl, xl, lsl xh
+	mov	ip, ip, lsr xh
+
+#else
+
+7:	movs	xl, xl, lsl #1
+	mov	ip, ip, lsr #1
+	bcc	7b
+
+#endif
+
+	@ Current remainder is now 1.  It is worthless to compare with
+	@ divisor at this point since divisor can not be smaller than 3 here.
+	@ If possible, branch for another shift in the division loop.
+	@ If no bit position left then we are done.
+	movs	ip, ip, lsr #1
+	mov	xh, #1
+	bne	4b
+	mov	pc, lr
+
+8:	@ Division by a power of 2: determine what that divisor order is
+	@ then simply shift values around
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	ip, r4
+	rsb	ip, ip, #31
+
+#else
+
+	@ No clz before ARMv5: binary search for the set bit instead.
+	mov	yl, r4
+	cmp	r4, #(1 << 16)
+	mov	ip, #0
+	movhs	yl, yl, lsr #16
+	movhs	ip, #16
+
+	cmp	yl, #(1 << 8)
+	movhs	yl, yl, lsr #8
+	addhs	ip, ip, #8
+
+	cmp	yl, #(1 << 4)
+	movhs	yl, yl, lsr #4
+	addhs	ip, ip, #4
+
+	cmp	yl, #(1 << 2)
+	addhi	ip, ip, #3
+	addls	ip, ip, yl, lsr #1
+
+#endif
+
+	@ ip = log2(divisor): quotient = dividend >> ip, and the
+	@ remainder is the low ip bits of the dividend.
+	mov	yh, xh, lsr ip
+	mov	yl, xl, lsr ip
+	rsb	ip, ip, #32
+ ARM(	orr	yl, yl, xh, lsl ip	)
+ THUMB(	lsl	xh, xh, ip		)
+ THUMB(	orr	yl, yl, xh		)
+	mov	xh, xl, lsl ip
+	mov	xh, xh, lsr ip
+	mov	pc, lr
+
+	@ eq -> division by 1: obvious enough...
+	@ ne -> divisor is 0: none of the moveq instructions executes and
+	@ control falls through into Ldiv0_64 below.
+9:	moveq	yl, xl
+	moveq	yh, xh
+	moveq	xh, #0
+	moveq	pc, lr
+UNWIND(.fnend)
+
+UNWIND(.fnstart)
+UNWIND(.pad #4)
+UNWIND(.save {lr})
+Ldiv0_64:
+	@ Division by 0:
+	@ lr is stored with an 8 byte stack adjustment around the call.
+	str	lr, [sp, #-8]!
+	bl	__div0
+
+	@ as wrong as it could be...
+	mov	yl, #0
+	mov	yh, #0
+	mov	xh, #0
+	ldr	pc, [sp], #8
+
+UNWIND(.fnend)
+ENDPROC(__do_div64)
--- a/arch/arm/lib/ecard.S
+++ b/arch/arm/lib/ecard.S
@@ -0,0 +1,44 @@
+/*
+ *  linux/arch/arm/lib/ecard.S
+ *
+ *  Copyright (C) 1995, 1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 27/03/03 Ian Molton Clean up CONFIG_CPU
+ *
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#define CPSR2SPSR(rt) \
+		mrs	rt, cpsr; \
+		msr	spsr_cxsf, rt
+
+@ Purpose: call an expansion card loader to read bytes.
+@ Proto  : char ecard_loader_read(int offset, char *card_base, char *loader);
+@ Returns: byte read
+@
+@ The loader is entered at its first instruction with r1 = offset,
+@ r11 = card_base and SPSR holding a copy of the current CPSR.  lr is
+@ pointed at the ldmfd below (pc reads as the current instruction + 8),
+@ so the loader returns there.  r4 - r12 are preserved around the call.
+
+ENTRY(ecard_loader_read)
+		stmfd	sp!, {r4 - r12, lr}
+		mov	r11, r1			@ r11 = card_base
+		mov	r1, r0			@ r1 = offset
+		CPSR2SPSR(r0)			@ r0 is only scratch here
+		mov	lr, pc			@ return address = the ldmfd
+		mov	pc, r2			@ enter the loader
+		ldmfd	sp!, {r4 - r12, pc}
+ENDPROC(ecard_loader_read)
+
+@ Purpose: call an expansion card loader to reset the card
+@ Proto  : void ecard_loader_reset(int card_base, char *loader);
+@ Returns: nothing
+@
+@ The loader is entered 8 bytes past its start with r11 = card_base
+@ and SPSR holding a copy of the current CPSR.  lr is pointed at the
+@ ldmfd below (pc reads as the current instruction + 8), so the loader
+@ returns there.  r4 - r12 are preserved around the call.
+
+ENTRY(ecard_loader_reset)
+		stmfd	sp!, {r4 - r12, lr}
+		mov	r11, r0			@ r11 = card_base
+		CPSR2SPSR(r0)			@ r0 is only scratch here
+		mov	lr, pc			@ return address = the ldmfd
+		add	pc, r1, #8		@ enter the loader at offset 8
+		ldmfd	sp!, {r4 - r12, pc}
+ENDPROC(ecard_loader_reset)
+
--- a/arch/arm/lib/findbit.S
+++ b/arch/arm/lib/findbit.S
@@ -0,0 +1,196 @@
+/*
+ *  linux/arch/arm/lib/findbit.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 16th March 2001 - John Ripley <jripley@sonicblue.com>
+ *   Fixed so that "size" is an exclusive not an inclusive quantity.
+ *   All users of these functions expect exclusive sizes, and may
+ *   also call with zero size.
+ * Reworked by rmk.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+                .text
+
+/*
+ * Purpose  : Find a 'zero' bit
+ * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit);
+ *
+ * r0 = addr, r1 = maxbit (exclusive), r2 = running bit index.
+ * The bitmap is scanned one byte at a time; maxbit is returned when
+ * no zero bit is found.  Labels 1, 2 and 3 are also branch targets
+ * (1b/2b/3b) of _find_next_zero_bit_le, which therefore has to follow
+ * this routine directly.
+ */
+ENTRY(_find_first_zero_bit_le)
+		teq	r1, #0			@ empty bitmap?
+		beq	3f
+		mov	r2, #0
+1:
+ ARM(		ldrb	r3, [r0, r2, lsr #3]	)
+ THUMB(		lsr	r3, r2, #3		)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		eors	r3, r3, #0xff		@ invert bits
+		bne	.L_found		@ any now set - found zero bit
+		add	r2, r2, #8		@ next bit pointer
+2:		cmp	r2, r1			@ any more?
+		blo	1b
+3:		mov	r0, r1			@ no free bits
+		mov	pc, lr
+ENDPROC(_find_first_zero_bit_le)
+
+/*
+ * Purpose  : Find next 'zero' bit
+ * Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
+ *
+ * r0 = addr, r1 = maxbit (exclusive), r2 = offset to start from.
+ * Only the partial first byte is handled here; the 1b, 2b and 3b
+ * branches jump back into _find_first_zero_bit_le for the rest.
+ */
+ENTRY(_find_next_zero_bit_le)
+		teq	r1, #0
+		beq	3b
+		ands	ip, r2, #7		@ ip = bit position within the byte
+		beq	1b			@ If new byte, goto old routine
+ ARM(		ldrb	r3, [r0, r2, lsr #3]	)
+ THUMB(		lsr	r3, r2, #3		)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		eor	r3, r3, #0xff		@ now looking for a 1 bit
+		movs	r3, r3, lsr ip		@ shift off unused bits
+		bne	.L_found
+		orr	r2, r2, #7		@ if zero, then no bits here
+		add	r2, r2, #1		@ align bit pointer
+		b	2b			@ loop for next bit
+ENDPROC(_find_next_zero_bit_le)
+
+/*
+ * Purpose  : Find a 'one' bit
+ * Prototype: int find_first_bit(const unsigned long *addr, unsigned int maxbit);
+ *
+ * r0 = addr, r1 = maxbit (exclusive), r2 = running bit index.
+ * The bitmap is scanned one byte at a time; maxbit is returned when
+ * no set bit is found.  Labels 1, 2 and 3 are also branch targets
+ * (1b/2b/3b) of _find_next_bit_le, which therefore has to follow this
+ * routine directly.
+ */
+ENTRY(_find_first_bit_le)
+		teq	r1, #0			@ empty bitmap?
+		beq	3f
+		mov	r2, #0
+1:
+ ARM(		ldrb	r3, [r0, r2, lsr #3]	)
+ THUMB(		lsr	r3, r2, #3		)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		movs	r3, r3
+		bne	.L_found		@ any set - found a one bit
+		add	r2, r2, #8		@ next bit pointer
+2:		cmp	r2, r1			@ any more?
+		blo	1b
+3:		mov	r0, r1			@ no set bits
+		mov	pc, lr
+ENDPROC(_find_first_bit_le)
+
+/*
+ * Purpose  : Find next 'one' bit
+ * Prototype: int find_next_bit(void *addr, unsigned int maxbit, int offset)
+ *
+ * r0 = addr, r1 = maxbit (exclusive), r2 = offset to start from.
+ * Only the partial first byte is handled here; the 1b, 2b and 3b
+ * branches jump back into _find_first_bit_le for the rest.
+ */
+ENTRY(_find_next_bit_le)
+		teq	r1, #0
+		beq	3b
+		ands	ip, r2, #7		@ ip = bit position within the byte
+		beq	1b			@ If new byte, goto old routine
+ ARM(		ldrb	r3, [r0, r2, lsr #3]	)
+ THUMB(		lsr	r3, r2, #3		)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		movs	r3, r3, lsr ip		@ shift off unused bits
+		bne	.L_found
+		orr	r2, r2, #7		@ if zero, then no bits here
+		add	r2, r2, #1		@ align bit pointer
+		b	2b			@ loop for next bit
+ENDPROC(_find_next_bit_le)
+
+#ifdef __ARMEB__
+
+/*
+ * Big-endian counterpart of _find_first_zero_bit_le.  The only
+ * difference is the "eor #0x18", which flips bits 3 and 4 of the bit
+ * index and so reverses the byte order within each 32-bit word.
+ * Labels 1, 2 and 3 are reused by _find_next_zero_bit_be below.
+ */
+ENTRY(_find_first_zero_bit_be)
+		teq	r1, #0
+		beq	3f
+		mov	r2, #0
+1:		eor	r3, r2, #0x18		@ big endian byte ordering
+ ARM(		ldrb	r3, [r0, r3, lsr #3]	)
+ THUMB(		lsr	r3, #3			)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		eors	r3, r3, #0xff		@ invert bits
+		bne	.L_found		@ any now set - found zero bit
+		add	r2, r2, #8		@ next bit pointer
+2:		cmp	r2, r1			@ any more?
+		blo	1b
+3:		mov	r0, r1			@ no free bits
+		mov	pc, lr
+ENDPROC(_find_first_zero_bit_be)
+
+/*
+ * Big-endian counterpart of _find_next_zero_bit_le: r2 = start offset.
+ * Handles the partial first byte, then continues in
+ * _find_first_zero_bit_be through the 1b/2b/3b branches.
+ */
+ENTRY(_find_next_zero_bit_be)
+		teq	r1, #0
+		beq	3b
+		ands	ip, r2, #7		@ ip = bit position within the byte
+		beq	1b			@ If new byte, goto old routine
+		eor	r3, r2, #0x18		@ big endian byte ordering
+ ARM(		ldrb	r3, [r0, r3, lsr #3]	)
+ THUMB(		lsr	r3, #3			)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		eor	r3, r3, #0xff		@ now looking for a 1 bit
+		movs	r3, r3, lsr ip		@ shift off unused bits
+		bne	.L_found
+		orr	r2, r2, #7		@ if zero, then no bits here
+		add	r2, r2, #1		@ align bit pointer
+		b	2b			@ loop for next bit
+ENDPROC(_find_next_zero_bit_be)
+
+/*
+ * Big-endian counterpart of _find_first_bit_le.  The "eor #0x18"
+ * flips bits 3 and 4 of the bit index and so reverses the byte order
+ * within each 32-bit word.  Labels 1, 2 and 3 are reused by
+ * _find_next_bit_be below.
+ */
+ENTRY(_find_first_bit_be)
+		teq	r1, #0
+		beq	3f
+		mov	r2, #0
+1:		eor	r3, r2, #0x18		@ big endian byte ordering
+ ARM(		ldrb	r3, [r0, r3, lsr #3]	)
+ THUMB(		lsr	r3, #3			)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		movs	r3, r3
+		bne	.L_found		@ any set - found a one bit
+		add	r2, r2, #8		@ next bit pointer
+2:		cmp	r2, r1			@ any more?
+		blo	1b
+3:		mov	r0, r1			@ no set bits
+		mov	pc, lr
+ENDPROC(_find_first_bit_be)
+
+/*
+ * Big-endian counterpart of _find_next_bit_le: r2 = start offset.
+ * Handles the partial first byte, then continues in
+ * _find_first_bit_be through the 1b/2b/3b branches.
+ */
+ENTRY(_find_next_bit_be)
+		teq	r1, #0
+		beq	3b
+		ands	ip, r2, #7		@ ip = bit position within the byte
+		beq	1b			@ If new byte, goto old routine
+		eor	r3, r2, #0x18		@ big endian byte ordering
+ ARM(		ldrb	r3, [r0, r3, lsr #3]	)
+ THUMB(		lsr	r3, #3			)
+ THUMB(		ldrb	r3, [r0, r3]		)
+		movs	r3, r3, lsr ip		@ shift off unused bits
+		bne	.L_found
+		orr	r2, r2, #7		@ if zero, then no bits here
+		add	r2, r2, #1		@ align bit pointer
+		b	2b			@ loop for next bit
+ENDPROC(_find_next_bit_be)
+
+#endif
+
+/*
+ * One or more bits in the LSB of r3 are assumed to be set.
+ *
+ * Common exit for all the find routines above: r2 is the bit index
+ * that corresponds to bit 0 of r3 and r1 is maxbit.  Returns in r0 the
+ * index of the lowest set bit of r3, limited to maxbit.
+ */
+.L_found:
+#if __LINUX_ARM_ARCH__ >= 5
+		rsb	r0, r3, #0
+		and	r3, r3, r0		@ isolate the lowest set bit
+		clz	r3, r3
+		rsb	r3, r3, #31		@ r3 = position of that bit
+		add	r0, r2, r3
+#else
+		@ No clz before ARMv5: narrow down by 4, 2 and 1 bits,
+		@ moving the candidate bits up towards bit 7 as we go.
+		tst	r3, #0x0f
+		addeq	r2, r2, #4
+		movne	r3, r3, lsl #4
+		tst	r3, #0x30
+		addeq	r2, r2, #2
+		movne	r3, r3, lsl #2
+		tst	r3, #0x40
+		addeq	r2, r2, #1
+		mov	r0, r2
+#endif
+		cmp	r1, r0			@ Clamp to maxbit
+		movlo	r0, r1
+		mov	pc, lr
+
--- a/arch/arm/lib/floppydma.S
+++ b/arch/arm/lib/floppydma.S
@@ -0,0 +1,32 @@
+/*
+ *  linux/arch/arm/lib/floppydma.S
+ *
+ *  Copyright (C) 1995, 1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+		.text
+
+		/*
+		 * Code between floppy_fiqin_start and floppy_fiqin_end:
+		 * moves one byte from the device to memory and returns from
+		 * the exception with "subs pc, lr, #4".
+		 *   r9  = remaining count, decremented on every entry
+		 *   r10 = memory pointer, post-incremented
+		 *   r11 = device address; [r11, #-4] is read while the
+		 *         count is still positive, [r11] otherwise
+		 * NOTE(review): presumably installed as a FIQ handler with
+		 * r9-r11 preloaded by the driver - confirm against the user.
+		 */
+		.global	floppy_fiqin_end
+ENTRY(floppy_fiqin_start)
+		subs	r9, r9, #1
+		ldrgtb	r12, [r11, #-4]
+		ldrleb	r12, [r11], #0
+		strb	r12, [r10], #1
+		subs	pc, lr, #4
+floppy_fiqin_end:
+
+		/*
+		 * Code between floppy_fiqout_start and floppy_fiqout_end:
+		 * moves one byte from memory to the device and returns from
+		 * the exception with "subs pc, lr, #4".
+		 *   r9  = remaining count, decremented on every entry
+		 *   r10 = memory pointer, post-incremented while count >= 0
+		 *   r11 = device address; [r11, #-4] is written while the
+		 *         count is still positive, [r11] otherwise
+		 * Once the count has gone negative a zero byte is written
+		 * instead of reading further memory.
+		 * NOTE(review): presumably installed as a FIQ handler with
+		 * r9-r11 preloaded by the driver - confirm against the user.
+		 */
+		.global	floppy_fiqout_end
+ENTRY(floppy_fiqout_start)
+		subs	r9, r9, #1
+		ldrgeb	r12, [r10], #1
+		movlt	r12, #0
+		strleb	r12, [r11], #0
+		subles	pc, lr, #4
+		strb	r12, [r11, #-4]
+		subs	pc, lr, #4
+floppy_fiqout_end:
--- a/arch/arm/lib/getuser.S
+++ b/arch/arm/lib/getuser.S
@@ -0,0 +1,80 @@
+/*
+ *  linux/arch/arm/lib/getuser.S
+ *
+ *  Copyright (C) 2001 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Idea from x86 version, (C) Copyright 1998 Linus Torvalds
+ *
+ * These functions have a non-standard call interface to make them more
+ * efficient, especially as they return an error value in addition to
+ * the "real" return value.
+ *
+ * __get_user_X
+ *
+ * Inputs:	r0 contains the address
+ *		r1 contains the address limit, which must be preserved
+ * Outputs:	r0 is the error code
+ *		r2 contains the zero-extended value
+ *		lr corrupted
+ *
+ * No other registers must be altered.  (see <asm/uaccess.h>
+ * for specific ASM register usage).
+ *
+ * Note that ADDR_LIMIT is either 0 or 0xc0000000.
+ * Note also that it is intended that __get_user_bad is not global.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/domain.h>
+
+/*
+ * Load one byte from the user address in r0 into r2 and return r0 = 0.
+ * check_uaccess is defined outside this file; NOTE(review): presumably
+ * it validates r0 against the limit in r1 and branches to
+ * __get_user_bad on failure - confirm.  Label 1 is listed in
+ * __ex_table, so a fault on the load also ends up in __get_user_bad.
+ */
+ENTRY(__get_user_1)
+	check_uaccess r0, 1, r1, r2, __get_user_bad
+1: TUSER(ldrb)	r2, [r0]
+	mov	r0, #0
+	mov	pc, lr
+ENDPROC(__get_user_1)
+
+/*
+ * Load a 16-bit value from the user address in r0 into r2 as two
+ * separate byte loads (labels 2 and 3, both listed in __ex_table) that
+ * are then combined according to endianness; returns r0 = 0.
+ * With CONFIG_CPU_USE_DOMAINS the second byte goes through ip;
+ * otherwise it is loaded straight into r0, which is safe because the
+ * address is no longer needed and r0 is overwritten with 0 afterwards.
+ */
+ENTRY(__get_user_2)
+	check_uaccess r0, 2, r1, r2, __get_user_bad
+#ifdef CONFIG_CPU_USE_DOMAINS
+rb	.req	ip
+2:	ldrbt	r2, [r0], #1
+3:	ldrbt	rb, [r0], #0
+#else
+rb	.req	r0
+2:	ldrb	r2, [r0]
+3:	ldrb	rb, [r0, #1]
+#endif
+#ifndef __ARMEB__
+	orr	r2, r2, rb, lsl #8
+#else
+	orr	r2, rb, r2, lsl #8
+#endif
+	mov	r0, #0
+	mov	pc, lr
+ENDPROC(__get_user_2)
+
+/*
+ * Load one 32-bit word from the user address in r0 into r2 and return
+ * r0 = 0.  Label 4 is listed in __ex_table, so a fault on the load
+ * ends up in __get_user_bad.
+ */
+ENTRY(__get_user_4)
+	check_uaccess r0, 4, r1, r2, __get_user_bad
+4: TUSER(ldr)	r2, [r0]
+	mov	r0, #0
+	mov	pc, lr
+ENDPROC(__get_user_4)
+
+/*
+ * Common failure exit: the value register r2 is zeroed and r0 carries
+ * -EFAULT.  Deliberately not declared with ENTRY, so the symbol stays
+ * local to this file.
+ */
+__get_user_bad:
+	mov	r2, #0
+	mov	r0, #-EFAULT
+	mov	pc, lr
+ENDPROC(__get_user_bad)
+
+/*
+ * Exception table: one (faulting instruction, fixup) pair for each
+ * user-space load above; every fixup is __get_user_bad.
+ */
+.pushsection __ex_table, "a"
+	.long	1b, __get_user_bad
+	.long	2b, __get_user_bad
+	.long	3b, __get_user_bad
+	.long	4b, __get_user_bad
+.popsection
--- a/arch/arm/lib/io-acorn.S
+++ b/arch/arm/lib/io-acorn.S
@@ -0,0 +1,32 @@
+/*
+ *  linux/arch/arm/lib/io-acorn.S
+ *
+ *  Copyright (C) 1995, 1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 27/03/03 Ian Molton Clean up CONFIG_CPU
+ *
+ */
+#include <linux/linkage.h>
+#include <linux/kern_levels.h>
+#include <asm/assembler.h>
+
+		.text
+		.align
+
+/* printk format string; the single %08lX receives the caller address. */
+.Liosl_warning:
+		.ascii	KERN_WARNING "insl/outsl not implemented, called from %08lX\0"
+		.align
+
+/*
+ * These make no sense on Acorn machines.
+ * Print a warning message.
+ *
+ * insl and outsl share one body: r0 = format string, r1 = the return
+ * address of the caller.  printk is entered with a plain branch, so it
+ * returns directly to that caller.
+ */
+ENTRY(insl)
+ENTRY(outsl)
+		adr	r0, .Liosl_warning
+		mov	r1, lr
+		b	printk
--- a/arch/arm/lib/io-readsb.S
+++ b/arch/arm/lib/io-readsb.S
@@ -0,0 +1,123 @@
+/*
+ *  linux/arch/arm/lib/io-readsb.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * Reached from __raw_readsb with ip = r1 & 3 (non-zero).  Reads
+ * min(4 - ip, r2) bytes, i.e. 1 to 3, one at a time so that the buffer
+ * pointer r1 becomes word-aligned or the request is used up.  If
+ * bytes remain, continue at .Linsb_aligned; otherwise fall through
+ * into __raw_readsb, whose zero-length check then returns.
+ */
+.Linsb_align:	rsb	ip, ip, #4		@ bytes needed to reach alignment
+		cmp	ip, r2
+		movgt	ip, r2			@ never more than requested
+		cmp	ip, #2
+		ldrb	r3, [r0]		@ first byte, always
+		strb	r3, [r1], #1
+		ldrgeb	r3, [r0]		@ second byte if ip >= 2
+		strgeb	r3, [r1], #1
+		ldrgtb	r3, [r0]		@ third byte if ip == 3
+		strgtb	r3, [r1], #1
+		subs	r2, r2, ip
+		bne	.Linsb_aligned
+
+
+/*
+ * __raw_readsb: read r2 bytes from the fixed address in r0 (never
+ * incremented) into the buffer at r1.  Once r1 is word-aligned the
+ * bytes are packed into registers with the put_byte_N shifts (defined
+ * outside this file) and stored 16, 8 or 4 bytes at a time; the last
+ * 1-3 bytes are stored individually.
+ */
+ENTRY(__raw_readsb)
+		teq	r2, #0		@ do we have to check for the zero len?
+		moveq	pc, lr
+		ands	ip, r1, #3
+		bne	.Linsb_align
+
+.Linsb_aligned:	stmfd	sp!, {r4 - r6, lr}
+
+		subs	r2, r2, #16
+		bmi	.Linsb_no_16
+
+		/* 16 bytes per pass: loads interleaved with the packing */
+.Linsb_16_lp:	ldrb	r3, [r0]
+		ldrb	r4, [r0]
+		ldrb	r5, [r0]
+		mov	r3, r3,     put_byte_0
+		ldrb	r6, [r0]
+		orr	r3, r3, r4, put_byte_1
+		ldrb	r4, [r0]
+		orr	r3, r3, r5, put_byte_2
+		ldrb	r5, [r0]
+		orr	r3, r3, r6, put_byte_3
+		ldrb	r6, [r0]
+		mov	r4, r4,     put_byte_0
+		ldrb	ip, [r0]
+		orr	r4, r4, r5, put_byte_1
+		ldrb	r5, [r0]
+		orr	r4, r4, r6, put_byte_2
+		ldrb	r6, [r0]
+		orr	r4, r4, ip, put_byte_3
+		ldrb	ip, [r0]
+		mov	r5, r5,     put_byte_0
+		ldrb	lr, [r0]
+		orr	r5, r5, r6, put_byte_1
+		ldrb	r6, [r0]
+		orr	r5, r5, ip, put_byte_2
+		ldrb	ip, [r0]
+		orr	r5, r5, lr, put_byte_3
+		ldrb	lr, [r0]
+		mov	r6, r6,     put_byte_0
+		orr	r6, r6, ip, put_byte_1
+		ldrb	ip, [r0]
+		orr	r6, r6, lr, put_byte_2
+		orr	r6, r6, ip, put_byte_3
+		stmia	r1!, {r3 - r6}
+
+		subs	r2, r2, #16
+		bpl	.Linsb_16_lp
+
+		tst	r2, #15		@ nothing left over?
+		ldmeqfd	sp!, {r4 - r6, pc}
+
+		/*
+		 * r2 is now (remaining - 16); its low four bits still equal
+		 * the number of bytes left, which is all that is tested below.
+		 */
+.Linsb_no_16:	tst	r2, #8
+		beq	.Linsb_no_8
+
+		ldrb	r3, [r0]
+		ldrb	r4, [r0]
+		ldrb	r5, [r0]
+		mov	r3, r3,     put_byte_0
+		ldrb	r6, [r0]
+		orr	r3, r3, r4, put_byte_1
+		ldrb	r4, [r0]
+		orr	r3, r3, r5, put_byte_2
+		ldrb	r5, [r0]
+		orr	r3, r3, r6, put_byte_3
+		ldrb	r6, [r0]
+		mov	r4, r4,     put_byte_0
+		ldrb	ip, [r0]
+		orr	r4, r4, r5, put_byte_1
+		orr	r4, r4, r6, put_byte_2
+		orr	r4, r4, ip, put_byte_3
+		stmia	r1!, {r3, r4}
+
+.Linsb_no_8:	tst	r2, #4
+		beq	.Linsb_no_4
+
+		ldrb	r3, [r0]
+		ldrb	r4, [r0]
+		ldrb	r5, [r0]
+		ldrb	r6, [r0]
+		mov	r3, r3,     put_byte_0
+		orr	r3, r3, r4, put_byte_1
+		orr	r3, r3, r5, put_byte_2
+		orr	r3, r3, r6, put_byte_3
+		str	r3, [r1], #4
+
+.Linsb_no_4:	ands	r2, r2, #3
+		ldmeqfd	sp!, {r4 - r6, pc}
+
+		cmp	r2, #2
+		ldrb	r3, [r0]	@ first byte, always
+		strb	r3, [r1], #1
+		ldrgeb	r3, [r0]	@ second byte if r2 >= 2
+		strgeb	r3, [r1], #1
+		ldrgtb	r3, [r0]	@ third byte if r2 == 3
+		strgtb	r3, [r1]
+
+		ldmfd	sp!, {r4 - r6, pc}
+ENDPROC(__raw_readsb)
--- a/arch/arm/lib/io-readsl.S
+++ b/arch/arm/lib/io-readsl.S
@@ -0,0 +1,79 @@
+/*
+ *  linux/arch/arm/lib/io-readsl.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(__raw_readsl)
+		teq	r2, #0		@ r2 = word count; nothing to do when it is zero
+		moveq	pc, lr
+		ands	ip, r1, #3	@ ip = byte misalignment of the buffer pointer r1
+		bne	3f		@ unaligned buffer is handled from 3: onwards
+
+		subs	r2, r2, #4	@ r2 stays biased by -4 while the loop runs
+		bmi	2f		@ fewer than four words: go to the tail
+		stmfd	sp!, {r4, lr}	@ r4 and lr serve as extra data registers below
+1:		ldr	r3, [r0, #0]	@ four reads, all from the unchanged address in r0
+		ldr	r4, [r0, #0]
+		ldr	ip, [r0, #0]
+		ldr	lr, [r0, #0]
+		subs	r2, r2, #4
+		stmia	r1!, {r3, r4, ip, lr}	@ store them in read order, r1 advances by 16
+		bpl	1b
+		ldmfd	sp!, {r4, lr}
+2:		movs	r2, r2, lsl #31	@ C = bit 1 of the count, NE = bit 0 of the count
+		ldrcs	r3, [r0, #0]	@ bit 1 set: two more words
+		ldrcs	ip, [r0, #0]
+		stmcsia	r1!, {r3, ip}
+		ldrne	r3, [r0, #0]	@ bit 0 set: one final word
+		strne	r3, [r1, #0]
+		mov	pc, lr
+
+3:		ldr	r3, [r0]	@ first word read; its bytes are stored one at a time
+		cmp	ip, #2		@ flags pick how many bytes are needed to align r1
+		mov	ip, r3, get_byte_0
+		strb	ip, [r1], #1
+		bgt	6f		@ misalignment 3: one byte stored, three carried over
+		mov	ip, r3, get_byte_1
+		strb	ip, [r1], #1
+		beq	5f		@ misalignment 2: two bytes stored, two carried over
+		mov	ip, r3, get_byte_2
+		strb	ip, [r1], #1	@ misalignment 1: three bytes stored, one carried over
+
+4:		subs	r2, r2, #1	@ Z is set once the last word has been read
+		mov	ip, r3, pull #24	@ part of the previous word not yet stored
+		ldrne	r3, [r0]
+		orrne	ip, ip, r3, push #8	@ completed with the start of the next word
+		strne	ip, [r1], #4
+		bne	4b
+		b	8f		@ one leftover byte to store
+
+5:		subs	r2, r2, #1	@ same scheme with two bytes carried between words
+		mov	ip, r3, pull #16
+		ldrne	r3, [r0]
+		orrne	ip, ip, r3, push #16
+		strne	ip, [r1], #4
+		bne	5b
+		b	7f		@ two leftover bytes to store
+
+6:		subs	r2, r2, #1	@ same scheme with three bytes carried between words
+		mov	ip, r3, pull #8
+		ldrne	r3, [r0]
+		orrne	ip, ip, r3, push #24
+		strne	ip, [r1], #4
+		bne	6b
+
+		mov	r3, ip, get_byte_2	@ fall through: three leftover bytes in ip
+		strb	r3, [r1, #2]
+7:		mov	r3, ip, get_byte_1
+		strb	r3, [r1, #1]
+8:		mov	r3, ip, get_byte_0
+		strb	r3, [r1, #0]
+		mov	pc, lr
+ENDPROC(__raw_readsl)
--- a/arch/arm/lib/io-readsw-armv3.S
+++ b/arch/arm/lib/io-readsw-armv3.S
@@ -0,0 +1,106 @@
+/*
+ *  linux/arch/arm/lib/io-readsw-armv3.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.Linsw_bad_alignment:		@ reached when the buffer address is odd
+		adr	r0, .Linsw_bad_align_msg	@ r0 = format string; r1 still holds the buffer pointer
+		mov	r2, lr		@ r2 = return address of the caller, printed as lr
+		b	panic
+.Linsw_bad_align_msg:
+		.asciz	"insw: bad buffer alignment (0x%p, lr=0x%08lX)\n"
+		.align
+
+.Linsw_align:	tst	r1, #1		@ entered when r1 is not word aligned
+		bne	.Linsw_bad_alignment
+
+		ldr	r3, [r0]	@ halfword aligned: take one item to reach word alignment
+		strb	r3, [r1], #1	@ store bits 7..0
+		mov	r3, r3, lsr #8
+		strb	r3, [r1], #1	@ then bits 15..8
+
+		subs	r2, r2, #1
+		moveq	pc, lr		@ that was the only item; otherwise fall into the entry point
+
+ENTRY(__raw_readsw)
+		teq	r2, #0		@ r2 = item count; nothing to do when it is zero
+		moveq	pc, lr
+		tst	r1, #3		@ r1 = destination buffer
+		bne	.Linsw_align
+
+.Linsw_aligned:	mov	ip, #0xff
+		orr	ip, ip, ip, lsl #8	@ ip = 0x0000ffff, mask for the low halfword
+		stmfd	sp!, {r4, r5, r6, lr}
+
+		subs	r2, r2, #8	@ r2 stays biased by -8 while the loop runs
+		bmi	.Lno_insw_8
+
+.Linsw_8_lp:	ldr	r3, [r0]	@ eight reads from r0 per pass, two packed per word
+		and	r3, r3, ip	@ first read supplies bits 15..0
+		ldr	r4, [r0]
+		orr	r3, r3, r4, lsl #16	@ second read supplies bits 31..16
+
+		ldr	r4, [r0]
+		and	r4, r4, ip
+		ldr	r5, [r0]
+		orr	r4, r4, r5, lsl #16
+
+		ldr	r5, [r0]
+		and	r5, r5, ip
+		ldr	r6, [r0]
+		orr	r5, r5, r6, lsl #16
+
+		ldr	r6, [r0]
+		and	r6, r6, ip
+		ldr	lr, [r0]
+		orr	r6, r6, lr, lsl #16
+
+		stmia	r1!, {r3 - r6}	@ 16 bytes stored, r1 advanced
+
+		subs	r2, r2, #8
+		bpl	.Linsw_8_lp
+
+		tst	r2, #7		@ low three bits still equal the remaining count
+		ldmeqfd	sp!, {r4, r5, r6, pc}	@ nothing left: return
+
+.Lno_insw_8:	tst	r2, #4		@ four items left over?
+		beq	.Lno_insw_4
+
+		ldr	r3, [r0]
+		and	r3, r3, ip
+		ldr	r4, [r0]
+		orr	r3, r3, r4, lsl #16
+
+		ldr	r4, [r0]
+		and	r4, r4, ip
+		ldr	r5, [r0]
+		orr	r4, r4, r5, lsl #16
+
+		stmia	r1!, {r3, r4}
+
+.Lno_insw_4:	tst	r2, #2		@ two items left over?
+		beq	.Lno_insw_2
+
+		ldr	r3, [r0]
+		and	r3, r3, ip
+		ldr	r4, [r0]
+		orr	r3, r3, r4, lsl #16
+
+		str	r3, [r1], #4
+
+.Lno_insw_2:	tst	r2, #1		@ one last item: stored as two separate bytes
+		ldrne	r3, [r0]
+		strneb	r3, [r1], #1
+		movne	r3, r3, lsr #8
+		strneb	r3, [r1]
+
+		ldmfd	sp!, {r4, r5, r6, pc}
+
+
--- a/arch/arm/lib/io-readsw-armv4.S
+++ b/arch/arm/lib/io-readsw-armv4.S
@@ -0,0 +1,131 @@
+/*
+ *  linux/arch/arm/lib/io-readsw-armv4.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.macro	pack, rd, hw1, hw2
+#ifndef __ARMEB__
+		orr	\rd, \hw1, \hw2, lsl #16	@ little endian: first halfword in bits 15..0
+#else
+		orr	\rd, \hw2, \hw1, lsl #16	@ big endian: first halfword in bits 31..16
+#endif
+		.endm
+
+.Linsw_align:	movs	ip, r1, lsl #31	@ NE = bit 0 of r1 (odd address), C = bit 1 of r1
+		bne	.Linsw_noalign	@ odd buffer address needs byte-wise handling
+		ldrh	ip, [r0]	@ halfword aligned: move one item to reach word alignment
+		sub	r2, r2, #1
+		strh	ip, [r1], #2	@ then fall into the entry point, which re-checks r2
+
+ENTRY(__raw_readsw)
+		teq	r2, #0		@ r2 = halfword count; nothing to do when it is zero
+		moveq	pc, lr
+		tst	r1, #3		@ r1 = destination buffer
+		bne	.Linsw_align
+
+		stmfd	sp!, {r4, r5, lr}
+
+		subs	r2, r2, #8	@ r2 stays biased by -8 while the loop runs
+		bmi	.Lno_insw_8
+
+.Linsw_8_lp:	ldrh	r3, [r0]	@ eight halfword reads from r0 per pass
+		ldrh	r4, [r0]
+		pack	r3, r3, r4
+
+		ldrh	r4, [r0]
+		ldrh	r5, [r0]
+		pack	r4, r4, r5
+
+		ldrh	r5, [r0]
+		ldrh	ip, [r0]
+		pack	r5, r5, ip
+
+		ldrh	ip, [r0]
+		ldrh	lr, [r0]
+		pack	ip, ip, lr
+
+		subs	r2, r2, #8
+		stmia	r1!, {r3 - r5, ip}	@ four packed words stored, r1 advanced by 16
+		bpl	.Linsw_8_lp
+
+.Lno_insw_8:	tst	r2, #4		@ low three bits still equal the remaining count
+		beq	.Lno_insw_4
+
+		ldrh	r3, [r0]
+		ldrh	r4, [r0]
+		pack	r3, r3, r4
+
+		ldrh	r4, [r0]
+		ldrh	ip, [r0]
+		pack	r4, r4, ip
+
+		stmia	r1!, {r3, r4}
+
+.Lno_insw_4:	movs	r2, r2, lsl #31	@ C = bit 1 of the count, NE = bit 0 of the count
+		bcc	.Lno_insw_2
+
+		ldrh	r3, [r0]	@ two items left over
+		ldrh	ip, [r0]
+		pack	r3, r3, ip
+		str	r3, [r1], #4	@ str leaves the flags from the movs above intact
+
+.Lno_insw_2:	ldrneh	r3, [r0]	@ one last item when bit 0 was set
+		strneh	r3, [r1]
+
+		ldmfd	sp!, {r4, r5, pc}
+
+#ifdef __ARMEB__
+#define _BE_ONLY_(code...)	code
+#define _LE_ONLY_(code...)
+#define push_hbyte0		lsr #8
+#define pull_hbyte1		lsl #24
+#else
+#define _BE_ONLY_(code...)
+#define _LE_ONLY_(code...) code
+#define push_hbyte0		lsl #24
+#define pull_hbyte1		lsr #8
+#endif
+
+.Linsw_noalign:	stmfd	sp!, {r4, lr}	@ odd buffer address; flags are still those of the movs in .Linsw_align
+		ldrccb	ip, [r1, #-1]!	@ bit 1 clear: step r1 back one byte and fetch the byte already there
+		bcc	1f		@ that byte is written back unchanged with the first store
+
+		ldrh	ip, [r0]	@ bit 1 set: one item is split around the word boundary
+		sub	r2, r2, #1
+   _BE_ONLY_(	mov	ip, ip, ror #8		)
+		strb	ip, [r1], #1	@ first byte stored, r1 is now word aligned
+   _LE_ONLY_(	mov	ip, ip, lsr #8		)
+   _BE_ONLY_(	mov	ip, ip, lsr #24		)
+
+1:		subs	r2, r2, #2	@ ip holds one pending byte; at least two items left?
+		bmi	3f
+   _BE_ONLY_(	mov	ip, ip, lsl #24		)
+
+2:		ldrh	r3, [r0]	@ two items per pass
+		ldrh	r4, [r0]
+		subs	r2, r2, #2
+		orr	ip, ip, r3, lsl #8	@ pending byte + first item + part of second item
+		orr	ip, ip, r4, push_hbyte0
+		str	ip, [r1], #4
+		mov	ip, r4, pull_hbyte1	@ remaining byte of the second item becomes pending
+		bpl	2b
+
+   _BE_ONLY_(	mov	ip, ip, lsr #24		)
+
+3:		tst	r2, #1		@ one more item to read?
+		strb	ip, [r1], #1	@ the pending byte is always flushed
+		ldrneh	ip, [r0]
+   _BE_ONLY_(	movne	ip, ip, ror #8		)
+		strneb	ip, [r1], #1
+   _LE_ONLY_(	movne	ip, ip, lsr #8		)
+   _BE_ONLY_(	movne	ip, ip, lsr #24		)
+		strneb	ip, [r1]
+		ldmfd	sp!, {r4, pc}
+ENDPROC(__raw_readsw)
--- a/arch/arm/lib/io-shark.c
+++ b/arch/arm/lib/io-shark.c
@@ -0,0 +1,13 @@
+/*
+ *  linux/arch/arm/lib/io-shark.c
+ *
+ *  by Alexander Schulz
+ *
+ * derived from:
+ * linux/arch/arm/lib/io-ebsa.S
+ * Copyright (C) 1995, 1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
--- a/arch/arm/lib/io-writesb.S
+++ b/arch/arm/lib/io-writesb.S
@@ -0,0 +1,94 @@
+/*
+ *  linux/arch/arm/lib/io-writesb.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.macro	outword, rd
+#ifndef __ARMEB__
+		strb	\rd, [r0]		@ little endian: bits 7..0 first; the register is shifted in place
+		mov	\rd, \rd, lsr #8
+		strb	\rd, [r0]
+		mov	\rd, \rd, lsr #8
+		strb	\rd, [r0]
+		mov	\rd, \rd, lsr #8
+		strb	\rd, [r0]
+#else
+		mov	lr, \rd, lsr #24	@ big endian: bits 31..24 first; lr is used as scratch
+		strb	lr, [r0]
+		mov	lr, \rd, lsr #16
+		strb	lr, [r0]
+		mov	lr, \rd, lsr #8
+		strb	lr, [r0]
+		strb	\rd, [r0]
+#endif
+		.endm
+
+.Loutsb_align:	rsb	ip, ip, #4	@ ip = bytes needed to word-align r1 (1 to 3)
+		cmp	ip, r2
+		movgt	ip, r2		@ but never more than the byte count
+		cmp	ip, #2
+		ldrb	r3, [r1], #1	@ always at least one byte
+		strb	r3, [r0]
+		ldrgeb	r3, [r1], #1	@ second byte when ip is 2 or 3
+		strgeb	r3, [r0]
+		ldrgtb	r3, [r1], #1	@ third byte when ip is 3
+		strgtb	r3, [r0]
+		subs	r2, r2, ip
+		bne	.Loutsb_aligned	@ if r2 reached zero, fall into the entry point, which returns
+
+ENTRY(__raw_writesb)
+		teq	r2, #0		@ r2 = byte count; nothing to do when it is zero
+		moveq	pc, lr
+		ands	ip, r1, #3	@ ip = misalignment of the source buffer r1
+		bne	.Loutsb_align
+
+.Loutsb_aligned:
+		stmfd	sp!, {r4, r5, lr}
+
+		subs	r2, r2, #16	@ r2 stays biased by -16 while the loop runs
+		bmi	.Loutsb_no_16
+
+.Loutsb_16_lp:	ldmia	r1!, {r3, r4, r5, ip}	@ fetch 16 buffer bytes
+		outword	r3		@ each outword issues four byte stores to r0
+		outword	r4
+		outword	r5
+		outword	ip
+		subs	r2, r2, #16
+		bpl	.Loutsb_16_lp
+
+		tst	r2, #15		@ low four bits still equal the remaining count
+		ldmeqfd	sp!, {r4, r5, pc}
+
+.Loutsb_no_16:	tst	r2, #8		@ eight bytes left over?
+		beq	.Loutsb_no_8
+
+		ldmia	r1!, {r3, r4}
+		outword	r3
+		outword	r4
+
+.Loutsb_no_8:	tst	r2, #4		@ four bytes left over?
+		beq	.Loutsb_no_4
+
+		ldr	r3, [r1], #4
+		outword	r3
+
+.Loutsb_no_4:	ands	r2, r2, #3	@ final one to three bytes
+		ldmeqfd	sp!, {r4, r5, pc}
+
+		cmp	r2, #2
+		ldrb	r3, [r1], #1
+		strb	r3, [r0]
+		ldrgeb	r3, [r1], #1
+		strgeb	r3, [r0]
+		ldrgtb	r3, [r1]
+		strgtb	r3, [r0]
+
+		ldmfd	sp!, {r4, r5, pc}
+ENDPROC(__raw_writesb)
--- a/arch/arm/lib/io-writesl.S
+++ b/arch/arm/lib/io-writesl.S
@@ -0,0 +1,67 @@
+/*
+ *  linux/arch/arm/lib/io-writesl.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ENTRY(__raw_writesl)
+		teq	r2, #0		@ r2 = word count; nothing to do when it is zero
+		moveq	pc, lr
+		ands	ip, r1, #3	@ ip = byte misalignment of the source buffer r1
+		bne	3f
+
+		subs	r2, r2, #4	@ r2 stays biased by -4 while the loop runs
+		bmi	2f
+		stmfd	sp!, {r4, lr}	@ r4 and lr serve as extra data registers below
+1:		ldmia	r1!, {r3, r4, ip, lr}	@ fetch four buffer words
+		subs	r2, r2, #4
+		str	r3, [r0, #0]	@ four writes, all to the unchanged address in r0
+		str	r4, [r0, #0]
+		str	ip, [r0, #0]
+		str	lr, [r0, #0]
+		bpl	1b
+		ldmfd	sp!, {r4, lr}
+2:		movs	r2, r2, lsl #31	@ C = bit 1 of the count, NE = bit 0 of the count
+		ldmcsia	r1!, {r3, ip}	@ bit 1 set: two more words
+		strcs	r3, [r0, #0]
+		ldrne	r3, [r1, #0]	@ bit 0 set: one final word, loaded after r3 was written out
+		strcs	ip, [r0, #0]
+		strne	r3, [r0, #0]
+		mov	pc, lr
+
+3:		bic	r1, r1, #3	@ unaligned buffer: round r1 down to a word boundary
+		ldr	r3, [r1], #4	@ word holding the first buffer bytes
+		cmp	ip, #2
+		blt	5f		@ misalignment 1
+		bgt	6f		@ misalignment 3; misalignment 2 falls through
+
+4:		mov	ip, r3, pull #16	@ two bytes kept from the previous aligned word
+		ldr	r3, [r1], #4
+		subs	r2, r2, #1
+		orr	ip, ip, r3, push #16	@ two bytes taken from the next aligned word
+		str	ip, [r0]
+		bne	4b
+		mov	pc, lr
+
+5:		mov	ip, r3, pull #8	@ three bytes kept from the previous aligned word
+		ldr	r3, [r1], #4
+		subs	r2, r2, #1
+		orr	ip, ip, r3, push #24	@ one byte taken from the next aligned word
+		str	ip, [r0]
+		bne	5b
+		mov	pc, lr
+
+6:		mov	ip, r3, pull #24	@ one byte kept from the previous aligned word
+		ldr	r3, [r1], #4
+		subs	r2, r2, #1
+		orr	ip, ip, r3, push #8	@ three bytes taken from the next aligned word
+		str	ip, [r0]
+		bne	6b
+		mov	pc, lr
+ENDPROC(__raw_writesl)
--- a/arch/arm/lib/io-writesw-armv3.S
+++ b/arch/arm/lib/io-writesw-armv3.S
@@ -0,0 +1,126 @@
+/*
+ *  linux/arch/arm/lib/io-writesw-armv3.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+.Loutsw_bad_alignment:		@ reached when the buffer address is odd
+		adr	r0, .Loutsw_bad_align_msg	@ r0 = format string; r1 still holds the buffer pointer
+		mov	r2, lr		@ r2 = return address of the caller, printed as lr
+		b	panic
+.Loutsw_bad_align_msg:
+		.asciz	"outsw: bad buffer alignment (0x%p, lr=0x%08lX)\n"
+		.align
+
+.Loutsw_align:	tst	r1, #1		@ entered when r1 is not word aligned
+		bne	.Loutsw_bad_alignment
+
+		add	r1, r1, #2	@ halfword aligned: r1 moves up to the next word boundary
+
+		ldr	r3, [r1, #-4]	@ aligned word that contains the first buffer halfword
+		mov	r3, r3, lsr #16	@ keep bits 31..16 of it
+		orr	r3, r3, r3, lsl #16	@ and duplicate them into both halves
+		str	r3, [r0]
+		subs	r2, r2, #1
+		moveq	pc, lr		@ that was the only item; otherwise fall into the entry point
+
+ENTRY(__raw_writesw)
+		teq	r2, #0		@ r2 = item count; nothing to do when it is zero
+		moveq	pc, lr
+		tst	r1, #3		@ r1 = source buffer
+		bne	.Loutsw_align
+
+		stmfd	sp!, {r4, r5, r6, lr}
+
+		subs	r2, r2, #8	@ r2 stays biased by -8 while the loop runs
+		bmi	.Lno_outsw_8
+
+.Loutsw_8_lp:	ldmia	r1!, {r3, r4, r5, r6}	@ eight buffer halfwords in four words
+
+		mov	ip, r3, lsl #16	@ low halfword of r3 moved to the top half
+		orr	ip, ip, ip, lsr #16	@ then copied into the bottom half as well
+		str	ip, [r0]	@ every item is written as a 32-bit word with both halves equal
+
+		mov	ip, r3, lsr #16	@ high halfword of r3, duplicated the other way round
+		orr	ip, ip, ip, lsl #16
+		str	ip, [r0]
+
+		mov	ip, r4, lsl #16
+		orr	ip, ip, ip, lsr #16
+		str	ip, [r0]
+
+		mov	ip, r4, lsr #16
+		orr	ip, ip, ip, lsl #16
+		str	ip, [r0]
+
+		mov	ip, r5, lsl #16
+		orr	ip, ip, ip, lsr #16
+		str	ip, [r0]
+
+		mov	ip, r5, lsr #16
+		orr	ip, ip, ip, lsl #16
+		str	ip, [r0]
+
+		mov	ip, r6, lsl #16
+		orr	ip, ip, ip, lsr #16
+		str	ip, [r0]
+
+		mov	ip, r6, lsr #16
+		orr	ip, ip, ip, lsl #16
+		str	ip, [r0]
+
+		subs	r2, r2, #8
+		bpl	.Loutsw_8_lp
+
+		tst	r2, #7		@ low three bits still equal the remaining count
+		ldmeqfd	sp!, {r4, r5, r6, pc}
+
+.Lno_outsw_8:	tst	r2, #4		@ four items left over?
+		beq	.Lno_outsw_4
+
+		ldmia	r1!, {r3, r4}
+
+		mov	ip, r3, lsl #16
+		orr	ip, ip, ip, lsr #16
+		str	ip, [r0]
+
+		mov	ip, r3, lsr #16
+		orr	ip, ip, ip, lsl #16
+		str	ip, [r0]
+
+		mov	ip, r4, lsl #16
+		orr	ip, ip, ip, lsr #16
+		str	ip, [r0]
+
+		mov	ip, r4, lsr #16
+		orr	ip, ip, ip, lsl #16
+		str	ip, [r0]
+
+.Lno_outsw_4:	tst	r2, #2		@ two items left over?
+		beq	.Lno_outsw_2
+
+		ldr	r3, [r1], #4
+
+		mov	ip, r3, lsl #16
+		orr	ip, ip, ip, lsr #16
+		str	ip, [r0]
+
+		mov	ip, r3, lsr #16
+		orr	ip, ip, ip, lsl #16
+		str	ip, [r0]
+
+.Lno_outsw_2:	tst	r2, #1		@ one last item?
+
+		ldrne	r3, [r1]	@ a full word is loaded, only its low halfword is used
+
+		movne	ip, r3, lsl #16
+		orrne	ip, ip, ip, lsr #16
+		strne	ip, [r0]
+
+		ldmfd	sp!, {r4, r5, r6, pc}
--- a/arch/arm/lib/io-writesw-armv4.S
+++ b/arch/arm/lib/io-writesw-armv4.S
@@ -0,0 +1,100 @@
+/*
+ *  linux/arch/arm/lib/io-writesw-armv4.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.macro	outword, rd
+#ifndef __ARMEB__
+		strh	\rd, [r0]		@ little endian: bits 15..0 first
+		mov	\rd, \rd, lsr #16	@ the register is shifted in place
+		strh	\rd, [r0]
+#else
+		mov	lr, \rd, lsr #16	@ big endian: bits 31..16 first, lr is used as scratch
+		strh	lr, [r0]
+		strh	\rd, [r0]
+#endif
+		.endm
+
+.Loutsw_align:	movs	ip, r1, lsl #31	@ NE = bit 0 of r1 (odd address), C = bit 1 of r1
+		bne	.Loutsw_noalign	@ odd buffer address needs the shifting loop
+
+		ldrh	r3, [r1], #2	@ halfword aligned: move one item to reach word alignment
+		sub	r2, r2, #1
+		strh	r3, [r0]	@ then fall into the entry point, which re-checks r2
+
+ENTRY(__raw_writesw)
+		teq	r2, #0		@ r2 = halfword count; nothing to do when it is zero
+		moveq	pc, lr
+		ands	r3, r1, #3	@ r3 = byte misalignment of the source buffer r1
+		bne	.Loutsw_align
+
+		stmfd	sp!, {r4, r5, lr}
+
+		subs	r2, r2, #8	@ r2 stays biased by -8 while the loop runs
+		bmi	.Lno_outsw_8
+
+.Loutsw_8_lp:	ldmia	r1!, {r3, r4, r5, ip}	@ eight buffer halfwords in four words
+		subs	r2, r2, #8
+		outword	r3		@ each outword issues two halfword stores to r0
+		outword	r4
+		outword	r5
+		outword	ip
+		bpl	.Loutsw_8_lp
+
+.Lno_outsw_8:	tst	r2, #4		@ low three bits still equal the remaining count
+		beq	.Lno_outsw_4
+
+		ldmia	r1!, {r3, ip}
+		outword	r3
+		outword	ip
+
+.Lno_outsw_4:	movs	r2, r2, lsl #31	@ C = bit 1 of the count, NE = bit 0 of the count
+		bcc	.Lno_outsw_2
+
+		ldr	r3, [r1], #4	@ two items left over
+		outword	r3
+
+.Lno_outsw_2:	ldrneh	r3, [r1]	@ one last item when bit 0 was set
+		strneh	r3, [r0]
+
+		ldmfd	sp!, {r4, r5, pc}
+
+#ifdef __ARMEB__
+#define pull_hbyte0	lsl #8
+#define push_hbyte1	lsr #24
+#else
+#define pull_hbyte0	lsr #24
+#define push_hbyte1	lsl #8
+#endif
+
+.Loutsw_noalign:		@ r3 = r1 & 3 (1 or 3); flags still from the movs in .Loutsw_align
+ ARM(		ldr	r3, [r1, -r3]!	)  @ r1 rounded down to a word boundary, r3 = word there
+ THUMB(		rsb	r3, r3, #0	)  @ r3 = minus the misalignment, flags untouched
+ THUMB(		add	r1, r1, r3	)  @ round r1 down BEFORE r3 is overwritten by the load
+ THUMB(		ldr	r3, [r1]	)  @ same end state as the ARM write-back form
+		subcs	r2, r2, #1	@ bit 1 of the address set: the first item is assembled at 2:
+		bcs	2f
+		subs	r2, r2, #2	@ bit 1 clear: at least two items left?
+		bmi	3f
+
+1:		mov	ip, r3, lsr #8	@ item held in bits 23..8 of the current word
+		strh	ip, [r0]
+2:		mov	ip, r3, pull_hbyte0	@ item that straddles two aligned words
+		ldr	r3, [r1, #4]!
+		subs	r2, r2, #2
+		orr	ip, ip, r3, push_hbyte1
+		strh	ip, [r0]
+		bpl	1b
+
+		tst	r2, #1		@ one item left over?
+3:		movne	ip, r3, lsr #8
+		strneh	ip, [r0]
+		mov	pc, lr
+ENDPROC(__raw_writesw)
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -0,0 +1,363 @@
+/*
+ * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines
+ *
+ * Author: Nicolas Pitre <nico@fluxnic.net>
+ *   - contributed to gcc-3.4 on Sep 30, 2003
+ *   - adapted for the Linux kernel on Oct 2, 2003
+ */
+
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/unwind.h>
+
+.macro ARM_DIV_BODY dividend, divisor, result, curbit
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	\curbit, \divisor
+	clz	\result, \dividend
+	sub	\result, \curbit, \result	@ distance between the two leading one bits
+	mov	\curbit, #1
+	mov	\divisor, \divisor, lsl \result	@ line the divisor up with the dividend
+	mov	\curbit, \curbit, lsl \result	@ quotient bit that corresponds to that shift
+	mov	\result, #0
+	
+#else
+
+	@ Initially shift the divisor left 3 bits if possible,
+	@ set curbit accordingly.  This allows for curbit to be located
+	@ at the left end of each 4 bit nibbles in the division loop
+	@ to save one loop in most cases.
+	tst	\divisor, #0xe0000000
+	moveq	\divisor, \divisor, lsl #3
+	moveq	\curbit, #8
+	movne	\curbit, #1
+
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+1:	cmp	\divisor, #0x10000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #4
+	movlo	\curbit, \curbit, lsl #4
+	blo	1b
+
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+1:	cmp	\divisor, #0x80000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #1
+	movlo	\curbit, \curbit, lsl #1
+	blo	1b
+
+	mov	\result, #0
+
+#endif
+
+	@ Division loop: four quotient bits are resolved per pass.  On exit
+	@ the result register holds the quotient and the dividend register
+	@ holds what is left after all subtractions.
+1:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	orrhs	\result,   \result,   \curbit
+	cmp	\dividend, \divisor,  lsr #1
+	subhs	\dividend, \dividend, \divisor, lsr #1
+	orrhs	\result,   \result,   \curbit,  lsr #1
+	cmp	\dividend, \divisor,  lsr #2
+	subhs	\dividend, \dividend, \divisor, lsr #2
+	orrhs	\result,   \result,   \curbit,  lsr #2
+	cmp	\dividend, \divisor,  lsr #3
+	subhs	\dividend, \dividend, \divisor, lsr #3
+	orrhs	\result,   \result,   \curbit,  lsr #3
+	cmp	\dividend, #0			@ Early termination?
+	movnes	\curbit,   \curbit,  lsr #4	@ No, any more bits to do?
+	movne	\divisor,  \divisor, lsr #4
+	bne	1b
+
+.endm
+
+
+.macro ARM_DIV2_ORDER divisor, order
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	\order, \divisor
+	rsb	\order, \order, #31		@ order = index of the highest set bit
+
+#else
+
+	cmp	\divisor, #(1 << 16)		@ binary search for the set bit; the divisor is consumed
+	movhs	\divisor, \divisor, lsr #16
+	movhs	\order, #16
+	movlo	\order, #0
+
+	cmp	\divisor, #(1 << 8)
+	movhs	\divisor, \divisor, lsr #8
+	addhs	\order, \order, #8
+
+	cmp	\divisor, #(1 << 4)
+	movhs	\divisor, \divisor, lsr #4
+	addhs	\order, \order, #4
+
+	cmp	\divisor, #(1 << 2)		@ callers in this file only get here with a power of two
+	addhi	\order, \order, #3		@ above 4 (that is 8): three more
+	addls	\order, \order, \divisor, lsr #1	@ 1, 2 or 4: add 0, 1 or 2
+
+#endif
+
+.endm
+
+
+.macro ARM_MOD_BODY dividend, divisor, order, spare
+
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	\order, \divisor
+	clz	\spare, \dividend
+	sub	\order, \order, \spare		@ order = how far the divisor is shifted up
+	mov	\divisor, \divisor, lsl \order
+
+#else
+
+	mov	\order, #0
+
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is 
+	@ larger than the dividend.
+1:	cmp	\divisor, #0x10000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #4
+	addlo	\order, \order, #4
+	blo	1b
+
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+1:	cmp	\divisor, #0x80000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #1
+	addlo	\order, \order, #1
+	blo	1b
+
+#endif
+
+	@ Perform all needed subtractions to keep only the remainder.
+	@ Do comparisons in batches of 4 first.
+	subs	\order, \order, #3		@ yes, 3 is intended here
+	blt	2f
+
+1:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	cmp	\dividend, \divisor,  lsr #1
+	subhs	\dividend, \dividend, \divisor, lsr #1
+	cmp	\dividend, \divisor,  lsr #2
+	subhs	\dividend, \dividend, \divisor, lsr #2
+	cmp	\dividend, \divisor,  lsr #3
+	subhs	\dividend, \dividend, \divisor, lsr #3
+	cmp	\dividend, #1			@ GE unless the remainder is already zero
+	mov	\divisor, \divisor, lsr #4
+	subges	\order, \order, #4
+	bge	1b
+
+	tst	\order, #3			@ done when no steps remain
+	teqne	\dividend, #0			@ or when the remainder is already zero
+	beq	5f
+
+	@ Either 1, 2 or 3 comparisons/subtractions are left.
+2:	cmn	\order, #2
+	blt	4f
+	beq	3f
+	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	mov	\divisor,  \divisor,  lsr #1
+3:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	mov	\divisor,  \divisor,  lsr #1
+4:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+5:
+.endm
+
+
+ENTRY(__udivsi3)
+ENTRY(__aeabi_uidiv)
+UNWIND(.fnstart)
+
+	subs	r2, r1, #1			@ r0 = dividend, r1 = divisor, r2 = divisor - 1
+	moveq	pc, lr				@ divisor is 1: the quotient is the dividend itself
+	bcc	Ldiv0				@ borrow: the divisor was 0
+	cmp	r0, r1
+	bls	11f				@ dividend <= divisor: the quotient is 1 or 0
+	tst	r1, r2				@ divisor & (divisor - 1) is zero for a power of 2
+	beq	12f
+
+	ARM_DIV_BODY r0, r1, r2, r3
+
+	mov	r0, r2				@ quotient was accumulated in r2
+	mov	pc, lr
+
+11:	moveq	r0, #1				@ flags from the cmp above: equal gives 1
+	movne	r0, #0				@ smaller dividend gives 0
+	mov	pc, lr
+
+12:	ARM_DIV2_ORDER r1, r2
+
+	mov	r0, r0, lsr r2			@ power of 2 divisor: plain shift by its bit index
+	mov	pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__udivsi3)
+ENDPROC(__aeabi_uidiv)
+
+ENTRY(__umodsi3)
+UNWIND(.fnstart)
+
+	subs	r2, r1, #1			@ compare divisor with 1
+	bcc	Ldiv0				@ borrow: the divisor was 0
+	cmpne	r0, r1				@ compare dividend with divisor
+	moveq   r0, #0				@ divisor is 1 or equals the dividend: remainder 0
+	tsthi	r1, r2				@ see if divisor is power of 2
+	andeq	r0, r0, r2			@ power of 2: mask with divisor - 1
+	movls	pc, lr				@ any of the shortcuts above applied: done
+
+	ARM_MOD_BODY r0, r1, r2, r3
+
+	mov	pc, lr				@ remainder is left in r0
+
+UNWIND(.fnend)
+ENDPROC(__umodsi3)
+
+ENTRY(__divsi3)
+ENTRY(__aeabi_idiv)
+UNWIND(.fnstart)
+
+	cmp	r1, #0
+	eor	ip, r0, r1			@ save the sign of the result.
+	beq	Ldiv0
+	rsbmi	r1, r1, #0			@ loops below use unsigned.
+	subs	r2, r1, #1			@ division by 1 or -1 ?
+	beq	10f
+	movs	r3, r0
+	rsbmi	r3, r0, #0			@ positive dividend value
+	cmp	r3, r1
+	bls	11f				@ magnitude of dividend <= magnitude of divisor
+	tst	r1, r2				@ divisor is power of 2 ?
+	beq	12f
+
+	ARM_DIV_BODY r3, r1, r0, r2
+
+	cmp	ip, #0				@ negate the quotient when the operand signs differed
+	rsbmi	r0, r0, #0
+	mov	pc, lr
+
+10:	teq	ip, r0				@ ip ^ r0 has the sign of the original divisor
+	rsbmi	r0, r0, #0			@ divisor was -1: negate the dividend
+	mov	pc, lr
+
+11:	movlo	r0, #0				@ smaller magnitude: quotient 0
+	moveq	r0, ip, asr #31			@ equal magnitudes: 0 or -1 from the result sign
+	orreq	r0, r0, #1			@ turned into +1 or -1
+	mov	pc, lr
+
+12:	ARM_DIV2_ORDER r1, r2
+
+	cmp	ip, #0
+	mov	r0, r3, lsr r2			@ shift the magnitude of the dividend
+	rsbmi	r0, r0, #0			@ and restore the sign
+	mov	pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__divsi3)
+ENDPROC(__aeabi_idiv)
+
+ENTRY(__modsi3)
+UNWIND(.fnstart)
+
+	cmp	r1, #0
+	beq	Ldiv0
+	rsbmi	r1, r1, #0			@ loops below use unsigned.
+	movs	ip, r0				@ preserve sign of dividend
+	rsbmi	r0, r0, #0			@ if negative make positive
+	subs	r2, r1, #1			@ compare divisor with 1
+	cmpne	r0, r1				@ compare dividend with divisor
+	moveq	r0, #0				@ divisor is 1 or magnitudes are equal: remainder 0
+	tsthi	r1, r2				@ see if divisor is power of 2
+	andeq	r0, r0, r2			@ power of 2: mask with divisor - 1
+	bls	10f				@ a shortcut applied: only the sign is left to fix
+
+	ARM_MOD_BODY r0, r1, r2, r3
+
+10:	cmp	ip, #0				@ the remainder takes the sign of the dividend
+	rsbmi	r0, r0, #0
+	mov	pc, lr
+
+UNWIND(.fnend)
+ENDPROC(__modsi3)
+
+#ifdef CONFIG_AEABI
+
+ENTRY(__aeabi_uidivmod)
+UNWIND(.fnstart)
+UNWIND(.save {r0, r1, ip, lr}	)
+
+	stmfd	sp!, {r0, r1, ip, lr}		@ keep dividend and divisor across the call
+	bl	__aeabi_uidiv			@ r0 = quotient
+	ldmfd	sp!, {r1, r2, ip, lr}		@ r1 = dividend, r2 = divisor
+	mul	r3, r0, r2
+	sub	r1, r1, r3			@ r1 = dividend - quotient * divisor
+	mov	pc, lr				@ returns quotient in r0 and remainder in r1
+
+UNWIND(.fnend)
+ENDPROC(__aeabi_uidivmod)
+
+ENTRY(__aeabi_idivmod)
+UNWIND(.fnstart)
+UNWIND(.save {r0, r1, ip, lr}	)
+	stmfd	sp!, {r0, r1, ip, lr}		@ keep dividend and divisor across the call
+	bl	__aeabi_idiv			@ r0 = signed quotient
+	ldmfd	sp!, {r1, r2, ip, lr}		@ r1 = dividend, r2 = divisor
+	mul	r3, r0, r2
+	sub	r1, r1, r3			@ r1 = dividend - quotient * divisor
+	mov	pc, lr				@ returns quotient in r0 and remainder in r1
+
+UNWIND(.fnend)
+ENDPROC(__aeabi_idivmod)
+
+#endif
+
+Ldiv0:					@ common target of the zero-divisor checks above
+UNWIND(.fnstart)
+UNWIND(.pad #4)
+UNWIND(.save {lr})
+	str	lr, [sp, #-8]!		@ push lr, moving sp down by 8 bytes
+	bl	__div0			@ handler defined outside this file
+	mov	r0, #0			@ About as wrong as it could be.
+	ldr	pc, [sp], #8		@ pop the saved lr straight into pc
+UNWIND(.fnend)
+ENDPROC(Ldiv0)
--- a/arch/arm/lib/lshrdi3.S
+++ b/arch/arm/lib/lshrdi3.S
@@ -0,0 +1,53 @@
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+   Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define al r1
+#define ah r0
+#else
+#define al r0
+#define ah r1
+#endif
+
+ENTRY(__lshrdi3)
+ENTRY(__aeabi_llsr)
+
+	subs	r3, r2, #32		@ r2 = shift count; r3 = count - 32, MI when count < 32
+	rsb	ip, r2, #32		@ ip = 32 - count
+	movmi	al, al, lsr r2		@ count < 32: low word shifted down
+	movpl	al, ah, lsr r3		@ count >= 32: low word comes from the high word only
+ ARM(	orrmi	al, al, ah, lsl ip	)
+ THUMB(	lslmi	r3, ah, ip		)
+ THUMB(	orrmi	al, al, r3		)
+	mov	ah, ah, lsr r2		@ high word shifted down by the full count
+	mov	pc, lr
+
+ENDPROC(__lshrdi3)
+ENDPROC(__aeabi_llsr)
--- a/arch/arm/lib/memchr.S
+++ b/arch/arm/lib/memchr.S
@@ -0,0 +1,27 @@
+/*
+ *  linux/arch/arm/lib/memchr.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align	5
+ENTRY(memchr)
+	and	r1, r1, #0xff		@ r1 = value to find, reduced to a byte as the C contract requires
+1:	subs	r2, r2, #1		@ r2 = bytes left to examine
+	bmi	2f			@ none left: flags are NE here, so 2: returns NULL
+	ldrb	r3, [r0], #1		@ r0 = scan pointer, post-incremented
+	teq	r3, r1
+	bne	1b
+	sub	r0, r0, #1		@ match: undo the post-increment so r0 points at it
+2:	movne	r0, #0
+	mov	pc, lr
+ENDPROC(memchr)
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -0,0 +1,63 @@
+/*
+ *  linux/arch/arm/lib/memcpy.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+#define LDR1W_SHIFT	0
+#define STR1W_SHIFT	0
+
+	.macro ldr1w ptr reg abort
+	W(ldr) \reg, [\ptr], #4		@ load one word and advance the pointer by 4
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}	@ load four words with write-back
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}	@ load eight words with write-back
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1	@ conditional byte load, pointer advanced by 1
+	.endm
+
+	.macro str1w ptr reg abort
+	W(str) \reg, [\ptr], #4		@ store one word and advance the pointer by 4
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}	@ store eight words with write-back
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1	@ conditional byte store, pointer advanced by 1
+	.endm
+
+	.macro enter reg1 reg2
+	stmdb sp!, {r0, \reg1, \reg2}	@ r0 is saved together with the two work registers
+	.endm
+
+	.macro exit reg1 reg2
+	ldmfd sp!, {r0, \reg1, \reg2}	@ reloads the r0 saved by enter, presumably the dest return value
+	.endm
+
+	.text
+
+/*
+ * Prototype: void *memcpy(void *dest, const void *src, size_t n);
+ *
+ * The body comes from copy_template.S, which is not part of this file and
+ * is expanded with the macros defined above.  None of those macros uses
+ * its "abort" argument in this file.
+ */
+
+ENTRY(memcpy)
+
+#include "copy_template.S"
+
+ENDPROC(memcpy)
--- a/arch/arm/lib/memmove.S
+++ b/arch/arm/lib/memmove.S
@@ -0,0 +1,199 @@
+/*
+ *  linux/arch/arm/lib/memmove.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	(C) MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards.  This
+ * is a transposition of the code from copy_template.S but with the copy
+ * occurring in the opposite direction.
+ */
+
+ENTRY(memmove)
+
+		subs	ip, r0, r1	@ ip = dest - src
+		cmphi	r2, ip		@ only when dest is above src: compare n with that distance
+		bls	memcpy		@ dest <= src, or n <= distance: a forward copy is safe
+
+		stmfd	sp!, {r0, r4, lr}	@ r0 is saved so it can be returned unchanged
+		add	r1, r1, r2	@ both pointers now address one past the end
+		add	r0, r0, r2
+		subs	r2, r2, #4	@ r2 is kept biased; fewer than 4 bytes go to the byte tail
+		blt	8f
+		ands	ip, r0, #3	@ destination end not word aligned?
+	PLD(	pld	[r1, #-4]		)
+		bne	9f
+		ands	ip, r1, #3	@ source end not word aligned?
+		bne	10f
+
+1:		subs	r2, r2, #(28)	@ both aligned: at least 32 bytes left?
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, ip		)  @ C is set here
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #-4]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #-32]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #-64]		)
+	PLD(	pld	[r1, #-96]		)
+
+3:	PLD(	pld	[r1, #-128]		)
+4:		ldmdb	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}	@ 32 bytes per pass, moving downwards
+		subs	r2, r2, #32
+		stmdb	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28	@ ip = bytes left in whole words (0 to 28)
+		rsb	ip, ip, #32
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:		W(nop)			@ computed jump above lands inside this run of loads
+		W(ldr)	r3, [r1, #-4]!
+		W(ldr)	r4, [r1, #-4]!
+		W(ldr)	r5, [r1, #-4]!
+		W(ldr)	r6, [r1, #-4]!
+		W(ldr)	r7, [r1, #-4]!
+		W(ldr)	r8, [r1, #-4]!
+		W(ldr)	lr, [r1, #-4]!
+
+		add	pc, pc, ip	@ same offset selects the matching run of stores
+		nop
+		W(nop)
+		W(str)	r3, [r0, #-4]!
+		W(str)	r4, [r0, #-4]!
+		W(str)	r5, [r0, #-4]!
+		W(str)	r6, [r0, #-4]!
+		W(str)	r7, [r0, #-4]!
+		W(str)	r8, [r0, #-4]!
+		W(str)	lr, [r0, #-4]!
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31	@ NE = bit 0 of the count, C = bit 1 of the count
+		ldrneb	r3, [r1, #-1]!	@ one byte when bit 0 is set
+		ldrcsb	r4, [r1, #-1]!	@ two bytes when bit 1 is set
+		ldrcsb	ip, [r1, #-1]
+		strneb	r3, [r0, #-1]!
+		strcsb	r4, [r0, #-1]!
+		strcsb	ip, [r0, #-1]
+		ldmfd	sp!, {r0, r4, pc}	@ return with the original dest in r0
+
+9:		cmp	ip, #2		@ ip = low two bits of the destination end (1 to 3)
+		ldrgtb	r3, [r1, #-1]!	@ copy that many bytes to word-align the destination
+		ldrgeb	r4, [r1, #-1]!
+		ldrb	lr, [r1, #-1]!
+		strgtb	r3, [r0, #-1]!
+		strgeb	r4, [r0, #-1]!
+		subs	r2, r2, ip
+		strb	lr, [r0, #-1]!
+		blt	8b		@ less than a word left: finish in the byte tail
+		ands	ip, r1, #3
+		beq	1b		@ source is word aligned as well: use the aligned path
+
+10:		bic	r1, r1, #3	@ source misaligned by ip bytes: round r1 down
+		cmp	ip, #2
+		ldr	r3, [r1, #0]	@ aligned word that holds the topmost source bytes
+		beq	17f		@ misalignment 2
+		blt	18f		@ misalignment 1; misalignment 3 falls through
+
+
+		.macro	backward_copy_shift push pull
+
+		subs	r2, r2, #28	@ at least 32 bytes left?
+		blt	14f
+
+	CALGN(	ands	ip, r0, #31		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #-4]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #-32]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #-64]		)
+	PLD(	pld	[r1, #-96]		)
+
+12:	PLD(	pld	[r1, #-128]		)
+13:		ldmdb   r1!, {r7, r8, r9, ip}	@ eight aligned source words per pass
+		mov     lr, r3, push #\push
+		subs    r2, r2, #32
+		ldmdb   r1!, {r3, r4, r5, r6}
+		orr     lr, lr, ip, pull #\pull	@ each output word merges two neighbouring source words
+		mov     ip, ip, push #\push
+		orr     ip, ip, r9, pull #\pull
+		mov     r9, r9, push #\push
+		orr     r9, r9, r8, pull #\pull
+		mov     r8, r8, push #\push
+		orr     r8, r8, r7, pull #\pull
+		mov     r7, r7, push #\push
+		orr     r7, r7, r6, pull #\pull
+		mov     r6, r6, push #\push
+		orr     r6, r6, r5, pull #\pull
+		mov     r5, r5, push #\push
+		orr     r5, r5, r4, pull #\pull
+		mov     r4, r4, push #\push
+		orr     r4, r4, r3, pull #\pull
+		stmdb   r0!, {r4 - r9, ip, lr}	@ r3 carries over into the next pass
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28	@ ip = bytes left in whole words
+		beq	16f
+
+15:		mov     lr, r3, push #\push	@ one output word per pass
+		ldr	r3, [r1, #-4]!
+		subs	ip, ip, #4
+		orr	lr, lr, r3, pull #\pull
+		str	lr, [r0, #-4]!
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		add	r1, r1, #(\pull / 8)	@ turn r1 back into the exact byte position
+		b	8b		@ byte tail copies what is left
+
+		.endm
+
+
+		backward_copy_shift	push=8	pull=24
+
+17:		backward_copy_shift	push=16	pull=16
+
+18:		backward_copy_shift	push=24	pull=8
+
+ENDPROC(memmove)
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -0,0 +1,123 @@
+/*
+ *  linux/arch/arm/lib/memset.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align	5
+
+/*
+ * void *memset(void *s, int c, size_t n)
+ *
+ * In:	r0 = s, r1 = c, r2 = n
+ * Out:	r0 = s (r0 is never written; ip is the running store pointer)
+ * Uses:	r1, r2, r3, ip and the flags.  r8/lr (or r4-r8/lr in the CALGN
+ *	build) are pushed around the bulk loop and restored afterwards.
+ *
+ * Only the low byte of c may be stored.  It is masked on entry because
+ * the two orr replications at 1: would otherwise smear bits 8-31 of c
+ * into the fill word (c = 0x100 would store 0x01 bytes instead of 0x00).
+ */
+ENTRY(memset)
+	and	r1, r1, #255		@ c = (unsigned char)c
+	ands	r3, r0, #3		@ 1 unaligned?
+	mov	ip, r0			@ preserve r0 as return value
+	bne	6f			@ 1
+/*
+ * we know that the pointer in ip is aligned to a word boundary.
+ */
+1:	orr	r1, r1, r1, lsl #8	@ replicate the byte into
+	orr	r1, r1, r1, lsl #16	@ all four lanes of r1
+	mov	r3, r1
+	cmp	r2, #16
+	blt	4f
+
+#if ! CALGN(1)+0
+
+/*
+ * We need 2 extra registers for this loop - use r8 and the LR
+ */
+	stmfd	sp!, {r8, lr}
+	mov	r8, r1
+	mov	lr, r1
+
+2:	subs	r2, r2, #64
+	stmgeia	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
+	bgt	2b
+	ldmeqfd	sp!, {r8, pc}		@ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+	tst	r2, #32
+	stmneia	ip!, {r1, r3, r8, lr}
+	stmneia	ip!, {r1, r3, r8, lr}
+	tst	r2, #16
+	stmneia	ip!, {r1, r3, r8, lr}
+	ldmfd	sp!, {r8, lr}
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r8, lr}
+	mov	r4, r1
+	mov	r5, r1
+	mov	r6, r1
+	mov	r7, r1
+	mov	r8, r1
+	mov	lr, r1
+
+	cmp	r2, #96
+	tstgt	ip, #31
+	ble	3f			@ short, or already 32-byte aligned
+
+	and	r8, ip, #31
+	rsb	r8, r8, #32		@ r8 = bytes up to the next 32-byte line
+	sub	r2, r2, r8
+	movs	r8, r8, lsl #(32 - 4)	@ C = bit 4, N = bit 3, bit 30 = bit 2
+	stmcsia	ip!, {r4, r5, r6, r7}	@ 16 bytes
+	stmmiia	ip!, {r4, r5}		@ 8 bytes
+	tst	r8, #(1 << 30)
+	mov	r8, r1			@ r8 becomes a fill register again
+	strne	r1, [ip], #4		@ 4 bytes
+
+3:	subs	r2, r2, #64
+	stmgeia	ip!, {r1, r3-r8, lr}
+	stmgeia	ip!, {r1, r3-r8, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r8, pc}
+
+	tst	r2, #32
+	stmneia	ip!, {r1, r3-r8, lr}
+	tst	r2, #16
+	stmneia	ip!, {r4-r7}
+	ldmfd	sp!, {r4-r8, lr}
+
+#endif
+
+4:	tst	r2, #8
+	stmneia	ip!, {r1, r3}
+	tst	r2, #4
+	strne	r1, [ip], #4
+/*
+ * When we get here, we've got less than 4 bytes to zero.  We
+ * may have an unaligned pointer as well.
+ */
+5:	tst	r2, #2
+	strneb	r1, [ip], #1
+	strneb	r1, [ip], #1
+	tst	r2, #1
+	strneb	r1, [ip], #1
+	mov	pc, lr
+
+/*
+ * Unaligned start: r3 = ip & 3 (1..3).  Store 4 - r3 bytes to reach a
+ * word boundary, then rejoin the aligned path.  If fewer than 4 bytes
+ * were requested, only the low two bits of r2 matter at 5:, and they
+ * are unchanged by the subtraction of 4.
+ */
+6:	subs	r2, r2, #4		@ 1 do we have enough
+	blt	5b			@ 1 bytes to align with?
+	cmp	r3, #2			@ 1
+	strltb	r1, [ip], #1		@ 1
+	strleb	r1, [ip], #1		@ 1
+	strb	r1, [ip], #1		@ 1
+	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
+	b	1b
+ENDPROC(memset)
--- a/arch/arm/lib/memzero.S
+++ b/arch/arm/lib/memzero.S
@@ -0,0 +1,125 @@
+/*
+ *  linux/arch/arm/lib/memzero.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+	.align	5
+/*
+ * One word of padding: together with the seven instructions of the
+ * alignment stub below (7 * 4 bytes when assembled as 32-bit ARM
+ * instructions) this places ENTRY(__memzero) 32 bytes past the
+ * .align 5 boundary.
+ */
+	.word	0
+/*
+ * Align the pointer in r0.  r3 contains the number of bytes that we are
+ * mis-aligned by, and r1 is the number of bytes.  If r1 < 4, then we
+ * don't bother; we use byte stores instead.
+ */
+1:	subs	r1, r1, #4		@ 1 do we have enough
+	blt	5f			@ 1 bytes to align with?
+	cmp	r3, #2			@ 1
+	strltb	r2, [r0], #1		@ 1
+	strleb	r2, [r0], #1		@ 1
+	strb	r2, [r0], #1		@ 1
+	add	r1, r1, r3		@ 1 (r1 = r1 - (4 - r3))
+/*
+ * The pointer is now aligned and the length is adjusted.  Try doing the
+ * memzero again.
+ */
+
+/*
+ * __memzero: store r1 zero bytes starting at r0.
+ *
+ * In:	r0 = destination, r1 = byte count
+ * Uses:	r0 (advanced past the stored bytes), r1, r2, r3, ip, flags.
+ *	lr (and r4-r7 in the CALGN build) is pushed around the bulk
+ *	loop and restored.  No value is set up in r0 for the caller.
+ */
+ENTRY(__memzero)
+	mov	r2, #0			@ 1
+	ands	r3, r0, #3		@ 1 unaligned?
+	bne	1b			@ 1
+/*
+ * r3 = 0, and we know that the pointer in r0 is aligned to a word boundary.
+ */
+	cmp	r1, #16			@ 1 we can skip this chunk if we
+	blt	4f			@ 1 have < 16 bytes
+
+#if ! CALGN(1)+0
+
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+	str	lr, [sp, #-4]!		@ 1
+	mov	ip, r2			@ 1
+	mov	lr, r2			@ 1
+
+3:	subs	r1, r1, #64		@ 1 write 64 bytes out per loop
+	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
+	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
+	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
+	stmgeia	r0!, {r2, r3, ip, lr}	@ 4
+	bgt	3b			@ 1
+	ldmeqfd	sp!, {pc}		@ 1/2 quick exit
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+	tst	r1, #32			@ 1
+	stmneia	r0!, {r2, r3, ip, lr}	@ 4
+	stmneia	r0!, {r2, r3, ip, lr}	@ 4
+	tst	r1, #16			@ 1 16 bytes or more?
+	stmneia	r0!, {r2, r3, ip, lr}	@ 4
+	ldr	lr, [sp], #4		@ 1
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+	stmfd	sp!, {r4-r7, lr}
+	mov	r4, r2
+	mov	r5, r2
+	mov	r6, r2
+	mov	r7, r2
+	mov	ip, r2
+	mov	lr, r2
+
+	cmp	r1, #96
+	andgts	ip, r0, #31		@ ip = offset within a 32-byte line
+	ble	3f			@ short, or already line aligned
+
+	rsb	ip, ip, #32		@ ip = bytes up to the next line
+	sub	r1, r1, ip
+	movs	ip, ip, lsl #(32 - 4)	@ C = bit 4, N = bit 3 of the count
+	stmcsia	r0!, {r4, r5, r6, r7}	@ 16 bytes
+	stmmiia	r0!, {r4, r5}		@ 8 bytes
+	movs	ip, ip, lsl #2		@ C = bit 2 of the count
+	strcs	r2, [r0], #4		@ 4 bytes
+
+3:	subs	r1, r1, #64
+	stmgeia	r0!, {r2-r7, ip, lr}
+	stmgeia	r0!, {r2-r7, ip, lr}
+	bgt	3b
+	ldmeqfd	sp!, {r4-r7, pc}
+
+	tst	r1, #32
+	stmneia	r0!, {r2-r7, ip, lr}
+	tst	r1, #16
+	stmneia	r0!, {r4-r7}
+	ldmfd	sp!, {r4-r7, lr}
+
+#endif
+
+4:	tst	r1, #8			@ 1 8 bytes or more?
+	stmneia	r0!, {r2, r3}		@ 2
+	tst	r1, #4			@ 1 4 bytes or more?
+	strne	r2, [r0], #4		@ 1
+/*
+ * When we get here, we've got less than 4 bytes to zero.  We
+ * may have an unaligned pointer as well.
+ */
+5:	tst	r1, #2			@ 1 2 bytes or more?
+	strneb	r2, [r0], #1		@ 1
+	strneb	r2, [r0], #1		@ 1
+	tst	r1, #1			@ 1 a byte left over
+	strneb	r2, [r0], #1		@ 1
+	mov	pc, lr			@ 1
+ENDPROC(__memzero)
--- a/arch/arm/lib/muldi3.S
+++ b/arch/arm/lib/muldi3.S
@@ -0,0 +1,47 @@
+/*
+ *  linux/arch/arm/lib/muldi3.S
+ *
+ *  Author:     Nicolas Pitre
+ *  Created:    Oct 19, 2005
+ *  Copyright:  Monta Vista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#else
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#endif
+
+/*
+ * 64 x 64 -> low 64 bit multiply, built from 32-bit mul/mla only.
+ *
+ * In:	xh:xl and yh:yl (register assignment per the defines above)
+ * Out:	xh:xl = low 64 bits of the product
+ * Uses:	yl, yh, ip and the flags
+ *
+ * With xl = a:b and yl = c:d split into 16-bit halves:
+ *	result = b*d + ((a*d + b*c) << 16)
+ *		 + ((a*c + xh*yl + xl*yh) << 32)
+ */
+ENTRY(__muldi3)
+ENTRY(__aeabi_lmul)
+
+	mul	xh, yl, xh		@ xh  = yl * xh
+	mla	xh, xl, yh, xh		@ xh += xl * yh
+	mov	ip, xl, lsr #16		@ ip  = a (high half of xl)
+	mov	yh, yl, lsr #16		@ yh  = c (high half of yl)
+	bic	xl, xl, ip, lsl #16	@ xl  = b (low half of xl)
+	bic	yl, yl, yh, lsl #16	@ yl  = d (low half of yl)
+	mla	xh, yh, ip, xh		@ xh += c * a
+	mul	yh, xl, yh		@ yh  = b * c
+	mul	xl, yl, xl		@ xl  = d * b
+	mul	ip, yl, ip		@ ip  = d * a
+	adds	xl, xl, yh, lsl #16	@ add (b*c) << 16 to the low word,
+	adc	xh, xh, yh, lsr #16	@ its upper bits and carry to xh
+	adds	xl, xl, ip, lsl #16	@ likewise for (d*a) << 16
+	adc	xh, xh, ip, lsr #16
+	mov	pc, lr
+
+ENDPROC(__muldi3)
+ENDPROC(__aeabi_lmul)
--- a/arch/arm/lib/putuser.S
+++ b/arch/arm/lib/putuser.S
@@ -0,0 +1,98 @@
+/*
+ *  linux/arch/arm/lib/putuser.S
+ *
+ *  Copyright (C) 2001 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Idea from x86 version, (C) Copyright 1998 Linus Torvalds
+ *
+ * These functions have a non-standard call interface to make
+ * them more efficient, especially as they return an error
+ * value in addition to the "real" return value.
+ *
+ * __put_user_X
+ *
+ * Inputs:	r0 contains the address
+ *		r1 contains the address limit, which must be preserved
+ *		r2, r3 contains the value
+ * Outputs:	r0 is the error code
+ *		lr corrupted
+ *
+ * No other registers must be altered.  (see <asm/uaccess.h>
+ * for specific ASM register usage).
+ *
+ * Note that ADDR_LIMIT is either 0 or 0xc0000000
+ * Note also that it is intended that __put_user_bad is not global.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/domain.h>
+
+/*
+ * Store the low byte of r2 at user address r0.  Returns r0 = 0 on
+ * success.  check_uaccess is given r0, the size, the limit in r1, ip as
+ * a temporary and __put_user_bad as its failure target.
+ * NOTE(review): check_uaccess and TUSER() come from <asm/assembler.h>,
+ * not shown here - confirm their exact expansion there.
+ */
+ENTRY(__put_user_1)
+	check_uaccess r0, 1, r1, ip, __put_user_bad
+1: TUSER(strb)	r2, [r0]
+	mov	r0, #0
+	mov	pc, lr
+ENDPROC(__put_user_1)
+
+/*
+ * Store the low 16 bits of r2 at user address r0 as two byte stores,
+ * so no halfword alignment is needed.  Returns r0 = 0 on success.
+ * ip is overwritten with bits 8 and up of r2.  The byte order of the
+ * two stores follows __ARMEB__.  The non-Thumb2 variant post-increments
+ * r0, which is harmless because r0 is replaced by the return code.
+ */
+ENTRY(__put_user_2)
+	check_uaccess r0, 2, r1, ip, __put_user_bad
+	mov	ip, r2, lsr #8
+#ifdef CONFIG_THUMB2_KERNEL
+#ifndef __ARMEB__
+2: TUSER(strb)	r2, [r0]
+3: TUSER(strb)	ip, [r0, #1]
+#else
+2: TUSER(strb)	ip, [r0]
+3: TUSER(strb)	r2, [r0, #1]
+#endif
+#else	/* !CONFIG_THUMB2_KERNEL */
+#ifndef __ARMEB__
+2: TUSER(strb)	r2, [r0], #1
+3: TUSER(strb)	ip, [r0]
+#else
+2: TUSER(strb)	ip, [r0], #1
+3: TUSER(strb)	r2, [r0]
+#endif
+#endif	/* CONFIG_THUMB2_KERNEL */
+	mov	r0, #0
+	mov	pc, lr
+ENDPROC(__put_user_2)
+
+/*
+ * Store the word in r2 at user address r0 with a single word store.
+ * Returns r0 = 0 on success.
+ */
+ENTRY(__put_user_4)
+	check_uaccess r0, 4, r1, ip, __put_user_bad
+4: TUSER(str)	r2, [r0]
+	mov	r0, #0
+	mov	pc, lr
+ENDPROC(__put_user_4)
+
+/*
+ * Store r2 at user address r0 and r3 at r0 + 4.  Returns r0 = 0 on
+ * success.  The non-Thumb2 variant reaches the second word by
+ * post-incrementing r0, which is then replaced by the return code.
+ */
+ENTRY(__put_user_8)
+	check_uaccess r0, 8, r1, ip, __put_user_bad
+#ifdef CONFIG_THUMB2_KERNEL
+5: TUSER(str)	r2, [r0]
+6: TUSER(str)	r3, [r0, #4]
+#else
+5: TUSER(str)	r2, [r0], #4
+6: TUSER(str)	r3, [r0]
+#endif
+	mov	r0, #0
+	mov	pc, lr
+ENDPROC(__put_user_8)
+
+/*
+ * Common failure exit: return -EFAULT in r0 to the caller in lr.
+ * Declared as a plain label (no ENTRY) so that it is not global.
+ */
+__put_user_bad:
+	mov	r0, #-EFAULT
+	mov	pc, lr
+ENDPROC(__put_user_bad)
+
+/*
+ * Exception table: one (instruction address, handler address) pair per
+ * user store above (labels 1-6), all pointing at __put_user_bad.
+ * NOTE(review): presumably the fault handler resumes at the second
+ * address when the first one faults - confirm in the fault code.
+ */
+.pushsection __ex_table, "a"
+	.long	1b, __put_user_bad
+	.long	2b, __put_user_bad
+	.long	3b, __put_user_bad
+	.long	4b, __put_user_bad
+	.long	5b, __put_user_bad
+	.long	6b, __put_user_bad
+.popsection
--- a/arch/arm/lib/setbit.S
+++ b/arch/arm/lib/setbit.S
@@ -0,0 +1,15 @@
+/*
+ *  linux/arch/arm/lib/setbit.S
+ *
+ *  Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include "bitops.h"
+		.text
+
+/*
+ * Instantiate _set_bit from the bitop macro in "bitops.h", passing orr
+ * as the instruction argument.
+ * NOTE(review): the macro body is not in this file; presumably it ORs
+ * the selected bit into the target word - confirm in bitops.h.
+ */
+bitop	_set_bit, orr
--- a/arch/arm/lib/strchr.S
+++ b/arch/arm/lib/strchr.S
@@ -0,0 +1,27 @@
+/*
+ *  linux/arch/arm/lib/strchr.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+		.align	5
+/*
+ * strchr: find the first occurrence of a byte in a NUL-terminated string.
+ *
+ * In:	r0 = string, r1 = byte to find (only bits 0-7 are used)
+ * Out:	r0 = address of the first matching byte, or 0 if the terminator
+ *	is reached first.  Searching for 0 returns the terminator itself.
+ * Uses:	r1, r2 and the flags
+ */
+ENTRY(strchr)
+		and	r1, r1, #0xff		@ compare as a byte
+.Lstrchr_scan:	ldrb	r2, [r0], #1		@ r2 = *s++
+		teq	r2, r1			@ match?
+		teqne	r2, #0			@ if not: end of string?
+		bne	.Lstrchr_scan		@ neither, keep scanning
+		teq	r2, r1			@ which of the two stopped us
+		subeq	r0, r0, #1		@ match: undo the post-increment
+		movne	r0, #0			@ terminator: not found
+		mov	pc, lr
+ENDPROC(strchr)
--- a/arch/arm/lib/strrchr.S
+++ b/arch/arm/lib/strrchr.S
@@ -0,0 +1,26 @@
+/*
+ *  linux/arch/arm/lib/strrchr.S
+ *
+ *  Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  ASM optimised string functions
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+		.text
+		.align	5
+/*
+ * strrchr: find the last occurrence of a byte in a NUL-terminated string.
+ *
+ * In:	r0 = string, r1 = byte to find (only bits 0-7 are used)
+ * Out:	r0 = address of the last matching byte, or 0 if there is none.
+ *	Searching for 0 returns the address of the terminator.
+ * Uses:	r1, r2, r3 and the flags
+ *
+ * r1 is reduced to a byte first, as strchr.S does.  ldrb zero-extends
+ * each string byte, so a value of r1 with any of bits 8-31 set could
+ * otherwise never match.
+ */
+ENTRY(strrchr)
+		and	r1, r1, #0xff		@ compare as a byte
+		mov	r3, #0			@ no match seen yet
+1:		ldrb	r2, [r0], #1		@ r2 = *s++
+		teq	r2, r1
+		subeq	r3, r0, #1		@ remember the latest match
+		teq	r2, #0
+		bne	1b			@ until the terminator
+		mov	r0, r3
+		mov	pc, lr
+ENDPROC(strrchr)
--- a/arch/arm/lib/testchangebit.S
+++ b/arch/arm/lib/testchangebit.S
@@ -0,0 +1,15 @@
+/*
+ *  linux/arch/arm/lib/testchangebit.S
+ *
+ *  Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include "bitops.h"
+                .text
+
+/*
+ * Instantiate _test_and_change_bit from the testop macro in "bitops.h",
+ * passing eor and an unconditional str as the instruction arguments.
+ * NOTE(review): the macro body is not in this file - confirm how the
+ * two instructions are used in bitops.h.
+ */
+testop	_test_and_change_bit, eor, str
--- a/arch/arm/lib/testclearbit.S
+++ b/arch/arm/lib/testclearbit.S
@@ -0,0 +1,15 @@
+/*
+ *  linux/arch/arm/lib/testclearbit.S
+ *
+ *  Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include "bitops.h"
+                .text
+
+/*
+ * Instantiate _test_and_clear_bit from the testop macro in "bitops.h",
+ * passing the conditional forms bicne and strne.
+ * NOTE(review): presumably this skips the write-back when the bit is
+ * already clear - confirm against the macro body in bitops.h.
+ */
+testop	_test_and_clear_bit, bicne, strne
--- a/arch/arm/lib/testsetbit.S
+++ b/arch/arm/lib/testsetbit.S
@@ -0,0 +1,15 @@
+/*
+ *  linux/arch/arm/lib/testsetbit.S
+ *
+ *  Copyright (C) 1995-1996 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include "bitops.h"
+                .text
+
+/*
+ * Instantiate _test_and_set_bit from the testop macro in "bitops.h",
+ * passing the conditional forms orreq and streq.
+ * NOTE(review): presumably this skips the write-back when the bit is
+ * already set - confirm against the macro body in bitops.h.
+ */
+testop	_test_and_set_bit, orreq, streq
--- a/arch/arm/lib/uaccess.S
+++ b/arch/arm/lib/uaccess.S
@@ -0,0 +1,564 @@
+/*
+ *  linux/arch/arm/lib/uaccess.S
+ *
+ *  Copyright (C) 1995, 1996,1997,1998 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  Routines to block copy data to/from user memory
+ *   These are highly optimised both for the 4k page size
+ *   and for various alignments.
+ */
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+#include <asm/errno.h>
+#include <asm/domain.h>
+
+		.text
+
+#define PAGE_SHIFT 12
+
+/* Prototype: int __copy_to_user(void *to, const char *from, size_t n)
+ * Purpose  : copy a block to user memory from kernel memory
+ * Params   : to   - user memory
+ *          : from - kernel memory
+ *          : n    - number of bytes to copy
+ * Returns  : Number of bytes NOT copied.
+ */
+
+/*
+ * Destination is not word aligned: ip = r0 & 3 on entry.  Copy the
+ * 4 - ip (1..3) bytes needed to align r0, charge them to r2 and rejoin
+ * the main path.  The caller has already checked r2 >= 4.
+ */
+.Lc2u_dest_not_aligned:
+		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldrb	r3, [r1], #1
+USER(	TUSER(	strb)	r3, [r0], #1)			@ May fault
+		ldrgeb	r3, [r1], #1
+USER(	TUSER(	strgeb) r3, [r0], #1)			@ May fault
+		ldrgtb	r3, [r1], #1
+USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
+		sub	r2, r2, ip
+		b	.Lc2u_dest_aligned
+
+/*
+ * Register use: r0 = user destination, r1 = kernel source, r2 = bytes
+ * left, ip = byte count for the current stretch, r3-r6 = data,
+ * r7 = previous source word in the unaligned-source variants.
+ *
+ * r2 (n) is pushed together with r4-r7 and lr so that the 9001 fixup
+ * below can pop it straight into r0.
+ * NOTE(review): USER()/TUSER() are defined in <asm/assembler.h>;
+ * presumably USER() routes a fault in the wrapped store to 9001.
+ */
+ENTRY(__copy_to_user)
+		stmfd	sp!, {r2, r4 - r7, lr}
+		cmp	r2, #4
+		blt	.Lc2u_not_enough
+		ands	ip, r0, #3
+		bne	.Lc2u_dest_not_aligned
+.Lc2u_dest_aligned:
+
+		ands	ip, r1, #3
+		bne	.Lc2u_src_not_aligned
+/*
+ * Seeing as there has to be at least 8 bytes to copy, we can
+ * copy one word, and force a user-mode page fault...
+ */
+
+/*
+ * Aligned source.  Each pass stores one word through USER(), then
+ * computes ip = bytes from r0 up to the next 4K boundary (0 when r0 is
+ * now page aligned) and copies that many bytes, capped at r2, with
+ * plain multi-register stores.
+ */
+.Lc2u_0fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lc2u_0nowords
+		ldr	r3, [r1], #4
+USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
+		mov	ip, r0, lsl #32 - PAGE_SHIFT	@ On each page, use a ld/st??t instruction
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lc2u_0fupi
+/*
+ * ip = max no. of bytes to copy before needing another "strt" insn
+ */
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #32
+		blt	.Lc2u_0rem8lp
+
+.Lc2u_0cpy8lp:	ldmia	r1!, {r3 - r6}
+		stmia	r0!, {r3 - r6}			@ Shouldnt fault
+		ldmia	r1!, {r3 - r6}
+		subs	ip, ip, #32
+		stmia	r0!, {r3 - r6}			@ Shouldnt fault
+		bpl	.Lc2u_0cpy8lp
+
+.Lc2u_0rem8lp:	cmn	ip, #16
+		ldmgeia	r1!, {r3 - r6}
+		stmgeia	r0!, {r3 - r6}			@ Shouldnt fault
+		tst	ip, #8
+		ldmneia	r1!, {r3 - r4}
+		stmneia	r0!, {r3 - r4}			@ Shouldnt fault
+		tst	ip, #4
+		ldrne	r3, [r1], #4
+	TUSER(	strne) r3, [r0], #4			@ Shouldnt fault
+		ands	ip, ip, #3
+		beq	.Lc2u_0fupi
+.Lc2u_0nowords:	teq	ip, #0
+		beq	.Lc2u_finished
+.Lc2u_nowords:	cmp	ip, #2				@ ip = 1..3 trailing bytes
+		ldrb	r3, [r1], #1
+USER(	TUSER(	strb)	r3, [r0], #1)			@ May fault
+		ldrgeb	r3, [r1], #1
+USER(	TUSER(	strgeb) r3, [r0], #1)			@ May fault
+		ldrgtb	r3, [r1], #1
+USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
+		b	.Lc2u_finished
+
+.Lc2u_not_enough:
+		movs	ip, r2				@ fewer than 4 bytes in total
+		bne	.Lc2u_nowords
+.Lc2u_finished:	mov	r0, #0				@ everything was copied
+		ldmfd	sp!, {r2, r4 - r7, pc}
+
+/*
+ * Source is not word aligned: ip = r1 & 3.  r1 is rounded down and r7
+ * is primed with the first aligned word.  Each output word is then
+ * assembled from r7 and the next source word with the pull/push shifts;
+ * the three variants below differ only in the shift amounts.
+ * NOTE(review): pull, push and get_byte_N are defined in
+ * <asm/assembler.h>, presumably per endianness - confirm there.
+ */
+.Lc2u_src_not_aligned:
+		bic	r1, r1, #3
+		ldr	r7, [r1], #4
+		cmp	ip, #2
+		bgt	.Lc2u_3fupi
+		beq	.Lc2u_2fupi
+.Lc2u_1fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lc2u_1nowords
+		mov	r3, r7, pull #8
+		ldr	r7, [r1], #4
+		orr	r3, r3, r7, push #24
+USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
+		mov	ip, r0, lsl #32 - PAGE_SHIFT
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lc2u_1fupi
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #16
+		blt	.Lc2u_1rem8lp
+
+.Lc2u_1cpy8lp:	mov	r3, r7, pull #8
+		ldmia	r1!, {r4 - r7}
+		subs	ip, ip, #16
+		orr	r3, r3, r4, push #24
+		mov	r4, r4, pull #8
+		orr	r4, r4, r5, push #24
+		mov	r5, r5, pull #8
+		orr	r5, r5, r6, push #24
+		mov	r6, r6, pull #8
+		orr	r6, r6, r7, push #24
+		stmia	r0!, {r3 - r6}			@ Shouldnt fault
+		bpl	.Lc2u_1cpy8lp
+
+.Lc2u_1rem8lp:	tst	ip, #8
+		movne	r3, r7, pull #8
+		ldmneia	r1!, {r4, r7}
+		orrne	r3, r3, r4, push #24
+		movne	r4, r4, pull #8
+		orrne	r4, r4, r7, push #24
+		stmneia	r0!, {r3 - r4}			@ Shouldnt fault
+		tst	ip, #4
+		movne	r3, r7, pull #8
+		ldrne	r7, [r1], #4
+		orrne	r3, r3, r7, push #24
+	TUSER(	strne) r3, [r0], #4			@ Shouldnt fault
+		ands	ip, ip, #3
+		beq	.Lc2u_1fupi
+.Lc2u_1nowords:	mov	r3, r7, get_byte_1
+		teq	ip, #0
+		beq	.Lc2u_finished
+		cmp	ip, #2
+USER(	TUSER(	strb)	r3, [r0], #1)			@ May fault
+		movge	r3, r7, get_byte_2
+USER(	TUSER(	strgeb) r3, [r0], #1)			@ May fault
+		movgt	r3, r7, get_byte_3
+USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
+		b	.Lc2u_finished
+
+.Lc2u_2fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lc2u_2nowords
+		mov	r3, r7, pull #16
+		ldr	r7, [r1], #4
+		orr	r3, r3, r7, push #16
+USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
+		mov	ip, r0, lsl #32 - PAGE_SHIFT
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lc2u_2fupi
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #16
+		blt	.Lc2u_2rem8lp
+
+.Lc2u_2cpy8lp:	mov	r3, r7, pull #16
+		ldmia	r1!, {r4 - r7}
+		subs	ip, ip, #16
+		orr	r3, r3, r4, push #16
+		mov	r4, r4, pull #16
+		orr	r4, r4, r5, push #16
+		mov	r5, r5, pull #16
+		orr	r5, r5, r6, push #16
+		mov	r6, r6, pull #16
+		orr	r6, r6, r7, push #16
+		stmia	r0!, {r3 - r6}			@ Shouldnt fault
+		bpl	.Lc2u_2cpy8lp
+
+.Lc2u_2rem8lp:	tst	ip, #8
+		movne	r3, r7, pull #16
+		ldmneia	r1!, {r4, r7}
+		orrne	r3, r3, r4, push #16
+		movne	r4, r4, pull #16
+		orrne	r4, r4, r7, push #16
+		stmneia	r0!, {r3 - r4}			@ Shouldnt fault
+		tst	ip, #4
+		movne	r3, r7, pull #16
+		ldrne	r7, [r1], #4
+		orrne	r3, r3, r7, push #16
+	TUSER(	strne) r3, [r0], #4			@ Shouldnt fault
+		ands	ip, ip, #3
+		beq	.Lc2u_2fupi
+.Lc2u_2nowords:	mov	r3, r7, get_byte_2
+		teq	ip, #0
+		beq	.Lc2u_finished
+		cmp	ip, #2
+USER(	TUSER(	strb)	r3, [r0], #1)			@ May fault
+		movge	r3, r7, get_byte_3
+USER(	TUSER(	strgeb) r3, [r0], #1)			@ May fault
+		ldrgtb	r3, [r1], #0			@ third byte comes from the next word
+USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
+		b	.Lc2u_finished
+
+.Lc2u_3fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lc2u_3nowords
+		mov	r3, r7, pull #24
+		ldr	r7, [r1], #4
+		orr	r3, r3, r7, push #8
+USER(	TUSER(	str)	r3, [r0], #4)			@ May fault
+		mov	ip, r0, lsl #32 - PAGE_SHIFT
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lc2u_3fupi
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #16
+		blt	.Lc2u_3rem8lp
+
+.Lc2u_3cpy8lp:	mov	r3, r7, pull #24
+		ldmia	r1!, {r4 - r7}
+		subs	ip, ip, #16
+		orr	r3, r3, r4, push #8
+		mov	r4, r4, pull #24
+		orr	r4, r4, r5, push #8
+		mov	r5, r5, pull #24
+		orr	r5, r5, r6, push #8
+		mov	r6, r6, pull #24
+		orr	r6, r6, r7, push #8
+		stmia	r0!, {r3 - r6}			@ Shouldnt fault
+		bpl	.Lc2u_3cpy8lp
+
+.Lc2u_3rem8lp:	tst	ip, #8
+		movne	r3, r7, pull #24
+		ldmneia	r1!, {r4, r7}
+		orrne	r3, r3, r4, push #8
+		movne	r4, r4, pull #24
+		orrne	r4, r4, r7, push #8
+		stmneia	r0!, {r3 - r4}			@ Shouldnt fault
+		tst	ip, #4
+		movne	r3, r7, pull #24
+		ldrne	r7, [r1], #4
+		orrne	r3, r3, r7, push #8
+	TUSER(	strne) r3, [r0], #4			@ Shouldnt fault
+		ands	ip, ip, #3
+		beq	.Lc2u_3fupi
+.Lc2u_3nowords:	mov	r3, r7, get_byte_3
+		teq	ip, #0
+		beq	.Lc2u_finished
+		cmp	ip, #2
+USER(	TUSER(	strb)	r3, [r0], #1)			@ May fault
+		ldrgeb	r3, [r1], #1
+USER(	TUSER(	strgeb) r3, [r0], #1)			@ May fault
+		ldrgtb	r3, [r1], #0
+USER(	TUSER(	strgtb) r3, [r0], #1)			@ May fault
+		b	.Lc2u_finished
+ENDPROC(__copy_to_user)
+
+		.pushsection .fixup,"ax"
+		.align	0
+		/*
+		 * Fault exit: pop the saved byte count (pushed from r2 at
+		 * entry) into r0 as the return value, restore r4-r7 and
+		 * return.
+		 */
+9001:		ldmfd	sp!, {r0, r4 - r7, pc}
+		.popsection
+
+/* Prototype: unsigned long __copy_from_user(void *to,const void *from,unsigned long n);
+ * Purpose  : copy a block from user memory to kernel memory
+ * Params   : to   - kernel memory
+ *          : from - user memory
+ *          : n    - number of bytes to copy
+ * Returns  : Number of bytes NOT copied.
+ */
+/*
+ * Destination is not word aligned: ip = r0 & 3 on entry.  Copy the
+ * 4 - ip (1..3) bytes needed to align r0, charge them to r2 and rejoin
+ * the main path.  The caller has already checked r2 >= 4.
+ */
+.Lcfu_dest_not_aligned:
+		rsb	ip, ip, #4
+		cmp	ip, #2
+USER(	TUSER(	ldrb)	r3, [r1], #1)			@ May fault
+		strb	r3, [r0], #1
+USER(	TUSER(	ldrgeb) r3, [r1], #1)			@ May fault
+		strgeb	r3, [r0], #1
+USER(	TUSER(	ldrgtb) r3, [r1], #1)			@ May fault
+		strgtb	r3, [r0], #1
+		sub	r2, r2, ip
+		b	.Lcfu_dest_aligned
+
+/*
+ * Register use: r0 = kernel destination, r1 = user source, r2 = bytes
+ * left, ip = byte count for the current stretch, r3-r6 = data,
+ * r7 = previous source word in the unaligned-source variants.
+ *
+ * The original r0 (to) and r2 (n) are pushed below r4-r7 and lr; the
+ * normal exit skips them and the 9001 fixup reads them back.
+ * NOTE(review): USER()/TUSER() are defined in <asm/assembler.h>;
+ * presumably USER() routes a fault in the wrapped load to 9001.
+ */
+ENTRY(__copy_from_user)
+		stmfd	sp!, {r0, r2, r4 - r7, lr}
+		cmp	r2, #4
+		blt	.Lcfu_not_enough
+		ands	ip, r0, #3
+		bne	.Lcfu_dest_not_aligned
+.Lcfu_dest_aligned:
+		ands	ip, r1, #3
+		bne	.Lcfu_src_not_aligned
+
+/*
+ * Seeing as there has to be at least 8 bytes to copy, we can
+ * copy one word, and force a user-mode page fault...
+ */
+
+/*
+ * Aligned source.  Each pass loads one word through USER(), then
+ * computes ip = bytes from r1 up to the next 4K boundary (0 when r1 is
+ * now page aligned) and copies that many bytes, capped at r2, with
+ * plain multi-register loads.
+ */
+.Lcfu_0fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lcfu_0nowords
+USER(	TUSER(	ldr)	r3, [r1], #4)
+		str	r3, [r0], #4
+		mov	ip, r1, lsl #32 - PAGE_SHIFT	@ On each page, use a ld/st??t instruction
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lcfu_0fupi
+/*
+ * ip = max no. of bytes to copy before needing another "strt" insn
+ */
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #32
+		blt	.Lcfu_0rem8lp
+
+.Lcfu_0cpy8lp:	ldmia	r1!, {r3 - r6}			@ Shouldnt fault
+		stmia	r0!, {r3 - r6}
+		ldmia	r1!, {r3 - r6}			@ Shouldnt fault
+		subs	ip, ip, #32
+		stmia	r0!, {r3 - r6}
+		bpl	.Lcfu_0cpy8lp
+
+.Lcfu_0rem8lp:	cmn	ip, #16
+		ldmgeia	r1!, {r3 - r6}			@ Shouldnt fault
+		stmgeia	r0!, {r3 - r6}
+		tst	ip, #8
+		ldmneia	r1!, {r3 - r4}			@ Shouldnt fault
+		stmneia	r0!, {r3 - r4}
+		tst	ip, #4
+	TUSER(	ldrne) r3, [r1], #4			@ Shouldnt fault
+		strne	r3, [r0], #4
+		ands	ip, ip, #3
+		beq	.Lcfu_0fupi
+.Lcfu_0nowords:	teq	ip, #0
+		beq	.Lcfu_finished
+.Lcfu_nowords:	cmp	ip, #2				@ ip = 1..3 trailing bytes
+USER(	TUSER(	ldrb)	r3, [r1], #1)			@ May fault
+		strb	r3, [r0], #1
+USER(	TUSER(	ldrgeb) r3, [r1], #1)			@ May fault
+		strgeb	r3, [r0], #1
+USER(	TUSER(	ldrgtb) r3, [r1], #1)			@ May fault
+		strgtb	r3, [r0], #1
+		b	.Lcfu_finished
+
+.Lcfu_not_enough:
+		movs	ip, r2				@ fewer than 4 bytes in total
+		bne	.Lcfu_nowords
+.Lcfu_finished:	mov	r0, #0				@ everything was copied
+		add	sp, sp, #8			@ drop the saved to and n
+		ldmfd	sp!, {r4 - r7, pc}
+
+/*
+ * Source is not word aligned: ip = r1 & 3.  r1 is rounded down and r7
+ * is primed with the first aligned word.  Each output word is then
+ * assembled from r7 and the next source word with the pull/push shifts;
+ * the three variants below differ only in the shift amounts.
+ * NOTE(review): pull, push and get_byte_N are defined in
+ * <asm/assembler.h>, presumably per endianness - confirm there.
+ */
+.Lcfu_src_not_aligned:
+		bic	r1, r1, #3
+USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
+		cmp	ip, #2
+		bgt	.Lcfu_3fupi
+		beq	.Lcfu_2fupi
+.Lcfu_1fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lcfu_1nowords
+		mov	r3, r7, pull #8
+USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
+		orr	r3, r3, r7, push #24
+		str	r3, [r0], #4
+		mov	ip, r1, lsl #32 - PAGE_SHIFT
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lcfu_1fupi
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #16
+		blt	.Lcfu_1rem8lp
+
+.Lcfu_1cpy8lp:	mov	r3, r7, pull #8
+		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
+		subs	ip, ip, #16
+		orr	r3, r3, r4, push #24
+		mov	r4, r4, pull #8
+		orr	r4, r4, r5, push #24
+		mov	r5, r5, pull #8
+		orr	r5, r5, r6, push #24
+		mov	r6, r6, pull #8
+		orr	r6, r6, r7, push #24
+		stmia	r0!, {r3 - r6}
+		bpl	.Lcfu_1cpy8lp
+
+.Lcfu_1rem8lp:	tst	ip, #8
+		movne	r3, r7, pull #8
+		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
+		orrne	r3, r3, r4, push #24
+		movne	r4, r4, pull #8
+		orrne	r4, r4, r7, push #24
+		stmneia	r0!, {r3 - r4}
+		tst	ip, #4
+		movne	r3, r7, pull #8
+USER(	TUSER(	ldrne) r7, [r1], #4)			@ May fault
+		orrne	r3, r3, r7, push #24
+		strne	r3, [r0], #4
+		ands	ip, ip, #3
+		beq	.Lcfu_1fupi
+.Lcfu_1nowords:	mov	r3, r7, get_byte_1
+		teq	ip, #0
+		beq	.Lcfu_finished
+		cmp	ip, #2
+		strb	r3, [r0], #1
+		movge	r3, r7, get_byte_2
+		strgeb	r3, [r0], #1
+		movgt	r3, r7, get_byte_3
+		strgtb	r3, [r0], #1
+		b	.Lcfu_finished
+
+.Lcfu_2fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lcfu_2nowords
+		mov	r3, r7, pull #16
+USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
+		orr	r3, r3, r7, push #16
+		str	r3, [r0], #4
+		mov	ip, r1, lsl #32 - PAGE_SHIFT
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lcfu_2fupi
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #16
+		blt	.Lcfu_2rem8lp
+
+
+.Lcfu_2cpy8lp:	mov	r3, r7, pull #16
+		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
+		subs	ip, ip, #16
+		orr	r3, r3, r4, push #16
+		mov	r4, r4, pull #16
+		orr	r4, r4, r5, push #16
+		mov	r5, r5, pull #16
+		orr	r5, r5, r6, push #16
+		mov	r6, r6, pull #16
+		orr	r6, r6, r7, push #16
+		stmia	r0!, {r3 - r6}
+		bpl	.Lcfu_2cpy8lp
+
+.Lcfu_2rem8lp:	tst	ip, #8
+		movne	r3, r7, pull #16
+		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
+		orrne	r3, r3, r4, push #16
+		movne	r4, r4, pull #16
+		orrne	r4, r4, r7, push #16
+		stmneia	r0!, {r3 - r4}
+		tst	ip, #4
+		movne	r3, r7, pull #16
+USER(	TUSER(	ldrne) r7, [r1], #4)			@ May fault
+		orrne	r3, r3, r7, push #16
+		strne	r3, [r0], #4
+		ands	ip, ip, #3
+		beq	.Lcfu_2fupi
+.Lcfu_2nowords:	mov	r3, r7, get_byte_2
+		teq	ip, #0
+		beq	.Lcfu_finished
+		cmp	ip, #2
+		strb	r3, [r0], #1
+		movge	r3, r7, get_byte_3
+		strgeb	r3, [r0], #1
+USER(	TUSER(	ldrgtb) r3, [r1], #0)			@ May fault
+		strgtb	r3, [r0], #1
+		b	.Lcfu_finished
+
+.Lcfu_3fupi:	subs	r2, r2, #4
+		addmi	ip, r2, #4
+		bmi	.Lcfu_3nowords
+		mov	r3, r7, pull #24
+USER(	TUSER(	ldr)	r7, [r1], #4)			@ May fault
+		orr	r3, r3, r7, push #8
+		str	r3, [r0], #4
+		mov	ip, r1, lsl #32 - PAGE_SHIFT
+		rsb	ip, ip, #0
+		movs	ip, ip, lsr #32 - PAGE_SHIFT
+		beq	.Lcfu_3fupi
+		cmp	r2, ip
+		movlt	ip, r2
+		sub	r2, r2, ip
+		subs	ip, ip, #16
+		blt	.Lcfu_3rem8lp
+
+.Lcfu_3cpy8lp:	mov	r3, r7, pull #24
+		ldmia	r1!, {r4 - r7}			@ Shouldnt fault
+		orr	r3, r3, r4, push #8
+		mov	r4, r4, pull #24
+		orr	r4, r4, r5, push #8
+		mov	r5, r5, pull #24
+		orr	r5, r5, r6, push #8
+		mov	r6, r6, pull #24
+		orr	r6, r6, r7, push #8
+		stmia	r0!, {r3 - r6}
+		subs	ip, ip, #16
+		bpl	.Lcfu_3cpy8lp
+
+.Lcfu_3rem8lp:	tst	ip, #8
+		movne	r3, r7, pull #24
+		ldmneia	r1!, {r4, r7}			@ Shouldnt fault
+		orrne	r3, r3, r4, push #8
+		movne	r4, r4, pull #24
+		orrne	r4, r4, r7, push #8
+		stmneia	r0!, {r3 - r4}
+		tst	ip, #4
+		movne	r3, r7, pull #24
+USER(	TUSER(	ldrne) r7, [r1], #4)			@ May fault
+		orrne	r3, r3, r7, push #8
+		strne	r3, [r0], #4
+		ands	ip, ip, #3
+		beq	.Lcfu_3fupi
+.Lcfu_3nowords:	mov	r3, r7, get_byte_3
+		teq	ip, #0
+		beq	.Lcfu_finished
+		cmp	ip, #2
+		strb	r3, [r0], #1
+USER(	TUSER(	ldrgeb) r3, [r1], #1)			@ May fault
+		strgeb	r3, [r0], #1
+USER(	TUSER(	ldrgtb) r3, [r1], #1)			@ May fault
+		strgtb	r3, [r0], #1
+		b	.Lcfu_finished
+ENDPROC(__copy_from_user)
+
+		.pushsection .fixup,"ax"
+		.align	0
+		/*
+		 * We took an exception.  r0 contains a pointer to
+		 * the byte not copied.
+		 *
+		 * Pop the saved destination and count, work out how many
+		 * bytes are still missing, zero that tail of the kernel
+		 * buffer with __memzero and return the shortfall in r0.
+		 * r4 carries the result across the call; it is restored
+		 * by the final ldmfd.
+		 */
+9001:		ldr	r2, [sp], #4			@ void *to
+		sub	r2, r0, r2			@ bytes copied
+		ldr	r1, [sp], #4			@ unsigned long count
+		subs	r4, r1, r2			@ bytes left to copy
+		movne	r1, r4
+		blne	__memzero
+		mov	r0, r4
+		ldmfd	sp!, {r4 - r7, pc}
+		.popsection
+
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -0,0 +1,235 @@
+/*
+ *  linux/arch/arm/lib/uaccess_with_memcpy.c
+ *
+ *  Written by: Lennert Buytenhek and Nicolas Pitre
+ *  Copyright (C) 2009 Marvell Semiconductor
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+#include <linux/rwsem.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h> /* for in_atomic() */
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <asm/current.h>
+#include <asm/page.h>
+
+/*
+ * Look up the PTE mapping user address @_addr in current->mm and, when it
+ * can be written to directly, hand it back mapped and locked.
+ *
+ * Returns 1 with *ptep and *ptlp filled in when the PTE is present, young,
+ * writable and dirty; the caller must then release it with
+ * pte_unmap_unlock().  Returns 0, holding nothing, when any page-table
+ * level is none/bad or the PTE fails one of those tests.
+ */
+static int
+pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
+{
+	const unsigned long uaddr = (unsigned long)_addr;
+	spinlock_t *lock;
+	pte_t *entry;
+	pmd_t *pmd;
+	pud_t *pud;
+	pgd_t *pgd;
+
+	/* Walk pgd -> pud -> pmd, giving up at the first unusable level. */
+	pgd = pgd_offset(current->mm, uaddr);
+	if (unlikely(pgd_none(*pgd) || pgd_bad(*pgd)))
+		return 0;
+
+	pud = pud_offset(pgd, uaddr);
+	if (unlikely(pud_none(*pud) || pud_bad(*pud)))
+		return 0;
+
+	pmd = pmd_offset(pud, uaddr);
+	if (unlikely(pmd_none(*pmd) || pmd_bad(*pmd)))
+		return 0;
+
+	/* Map and lock the PTE; keep it only if all four tests pass. */
+	entry = pte_offset_map_lock(current->mm, pmd, uaddr, &lock);
+	if (unlikely(!(pte_present(*entry) && pte_young(*entry) &&
+		       pte_write(*entry) && pte_dirty(*entry)))) {
+		pte_unmap_unlock(entry, lock);
+		return 0;
+	}
+
+	*ptlp = lock;
+	*ptep = entry;
+	return 1;
+}
+
+/*
+ * Copy @n bytes from kernel buffer @from to user buffer @to using plain
+ * memcpy(), one destination page at a time, while that page's PTE is
+ * pinned by pin_page_for_write().
+ *
+ * When a page cannot be pinned, a zero byte is written to it with
+ * __put_user() and the pin is retried; if that write fails the copy
+ * stops.  Returns the number of bytes not copied (0 on success).
+ */
+static unsigned long noinline
+__copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
+{
+	int take_sem;
+
+	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+		memcpy((void *)to, from, n);
+		return 0;
+	}
+
+	/* the mmap semaphore is taken only if not in an atomic context */
+	take_sem = !in_atomic();
+	if (take_sem)
+		down_read(&current->mm->mmap_sem);
+
+	while (n) {
+		pte_t *pte;
+		spinlock_t *ptl;
+		int chunk;
+
+		for (;;) {
+			if (pin_page_for_write(to, &pte, &ptl))
+				break;
+			/* drop the semaphore around the user access */
+			if (take_sem)
+				up_read(&current->mm->mmap_sem);
+			if (__put_user(0, (char __user *)to))
+				goto out;
+			if (take_sem)
+				down_read(&current->mm->mmap_sem);
+		}
+
+		/* bytes from @to up to the end of its page, capped at @n */
+		chunk = PAGE_SIZE - ((unsigned long)to & ~PAGE_MASK);
+		if (chunk > n)
+			chunk = n;
+
+		memcpy((void *)to, from, chunk);
+		n -= chunk;
+		from += chunk;
+		to += chunk;
+
+		pte_unmap_unlock(pte, ptl);
+	}
+
+	if (take_sem)
+		up_read(&current->mm->mmap_sem);
+
+out:
+	return n;
+}
+
+unsigned long
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	/*
+	 * Copies of 64 bytes or more go through the memcpy()-based path;
+	 * anything smaller uses the standard routine.  The size test
+	 * lives in this small wrapper so that short copies do not pay for
+	 * the large register dump of the main function, and so that the
+	 * call can become a tail call when frame pointers are disabled.
+	 */
+	if (n >= 64)
+		return __copy_to_user_memcpy(to, from, n);
+	return __copy_to_user_std(to, from, n);
+}
+	
+/*
+ * Zero @n bytes of user memory at @addr using plain memset(), one page
+ * at a time, while that page's PTE is pinned by pin_page_for_write().
+ *
+ * When a page cannot be pinned, a zero byte is written to it with
+ * __put_user() and the pin is retried; if that write fails the clear
+ * stops.  Returns the number of bytes not cleared (0 on success).
+ */
+static unsigned long noinline
+__clear_user_memset(void __user *addr, unsigned long n)
+{
+	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+		memset((void *)addr, 0, n);
+		return 0;
+	}
+
+	down_read(&current->mm->mmap_sem);
+
+	while (n) {
+		pte_t *pte;
+		spinlock_t *ptl;
+		int chunk;
+
+		for (;;) {
+			if (pin_page_for_write(addr, &pte, &ptl))
+				break;
+			/* drop the semaphore around the user access */
+			up_read(&current->mm->mmap_sem);
+			if (__put_user(0, (char __user *)addr))
+				goto out;
+			down_read(&current->mm->mmap_sem);
+		}
+
+		/* bytes from @addr up to the end of its page, capped at @n */
+		chunk = PAGE_SIZE - ((unsigned long)addr & ~PAGE_MASK);
+		if (chunk > n)
+			chunk = n;
+
+		memset((void *)addr, 0, chunk);
+		n -= chunk;
+		addr += chunk;
+
+		pte_unmap_unlock(pte, ptl);
+	}
+
+	up_read(&current->mm->mmap_sem);
+
+out:
+	return n;
+}
+
+unsigned long __clear_user(void __user *addr, unsigned long n)
+{
+	/*
+	 * Same split as in __copy_to_user(): 64 bytes or more go through
+	 * the memset()-based path, anything smaller uses the standard
+	 * routine.
+	 */
+	if (n >= 64)
+		return __clear_user_memset(addr, n);
+	return __clear_user_std(addr, n);
+}
+
+#if 0
+
+/*
+ * This code is disabled by default, but kept around in case the chosen
+ * thresholds need to be revalidated.  Some overhead (small but still)
+ * would be implied by a runtime determined variable threshold, and
+ * so far the measurement on concerned targets didn't show a worthwhile
+ * variation.
+ *
+ * Note that a fairly precise sched_clock() implementation is needed
+ * for results to make some sense.
+ */
+
+#include <linux/vmalloc.h>
+
+/*
+ * Boot-time benchmark (compiled out by the surrounding #if 0): times the
+ * memcpy/memset based user copy routines against the standard ones for
+ * sizes from PAGE_SIZE down to 4 bytes, halving each step, and prints
+ * the sched_clock() deltas for each size.
+ *
+ * Returns -ENOMEM if a page or the mapping cannot be set up, -EFAULT if
+ * any copy or clear reported leftover bytes, 0 otherwise.
+ */
+static int __init test_size_treshold(void)
+{
+	struct page *src_page, *dst_page;
+	void *user_ptr, *kernel_ptr;
+	unsigned long long start, mid, end;
+	int size;
+	int ret = -ENOMEM;
+
+	src_page = alloc_page(GFP_KERNEL);
+	if (!src_page)
+		goto out;
+	dst_page = alloc_page(GFP_KERNEL);
+	if (!dst_page)
+		goto free_src;
+	kernel_ptr = page_address(src_page);
+	user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010));
+	if (!user_ptr)
+		goto free_dst;
+
+	/* warm up the src page dcache */
+	ret = __copy_to_user_memcpy(user_ptr, kernel_ptr, PAGE_SIZE);
+
+	size = PAGE_SIZE;
+	while (size >= 4) {
+		start = sched_clock();
+		ret |= __copy_to_user_memcpy(user_ptr, kernel_ptr, size);
+		mid = sched_clock();
+		ret |= __copy_to_user_std(user_ptr, kernel_ptr, size);
+		end = sched_clock();
+		printk("copy_to_user: %d %llu %llu\n", size, mid - start, end - mid);
+		size /= 2;
+	}
+
+	size = PAGE_SIZE;
+	while (size >= 4) {
+		start = sched_clock();
+		ret |= __clear_user_memset(user_ptr, size);
+		mid = sched_clock();
+		ret |= __clear_user_std(user_ptr, size);
+		end = sched_clock();
+		printk("clear_user: %d %llu %llu\n", size, mid - start, end - mid);
+		size /= 2;
+	}
+
+	/* any leftover byte count from the runs above means a fault */
+	if (ret)
+		ret = -EFAULT;
+
+	vunmap(user_ptr);
+free_dst:
+	put_page(dst_page);
+free_src:
+	put_page(src_page);
+out:
+	return ret;
+}
+
+subsys_initcall(test_size_treshold);
+
+#endif
--- a/arch/arm/lib/ucmpdi2.S
+++ b/arch/arm/lib/ucmpdi2.S
@@ -0,0 +1,52 @@
+/*
+ *  linux/arch/arm/lib/ucmpdi2.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Oct 19, 2005
+ *  Copyright:	Monta Vista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#ifdef __ARMEB__
+#define xh r0
+#define xl r1
+#define yh r2
+#define yl r3
+#else
+#define xl r0
+#define xh r1
+#define yl r2
+#define yh r3
+#endif
+
+/*
+ * Unsigned 64-bit compare of xh:xl against yh:yl.
+ *
+ * Out:	r0 = 0 if x < y, 1 if x == y, 2 if x > y
+ * Uses:	flags only, apart from the result in r0
+ */
+ENTRY(__ucmpdi2)
+
+	cmp	xh, yh			@ high words decide ...
+	cmpeq	xl, yl			@ ... unless equal, then low words
+	movhi	r0, #2			@ x > y
+	moveq	r0, #1			@ x == y
+	movlo	r0, #0			@ x < y
+	mov	pc, lr
+
+ENDPROC(__ucmpdi2)
+
+#ifdef CONFIG_AEABI
+
+/*
+ * Unsigned 64-bit compare of xh:xl against yh:yl, three-way result.
+ *
+ * Out:	r0 = -1 if x < y, 0 if x == y, 1 if x > y
+ * Uses:	flags only, apart from the result in r0
+ */
+ENTRY(__aeabi_ulcmp)
+
+	cmp	xh, yh			@ high words decide ...
+	cmpeq	xl, yl			@ ... unless equal, then low words
+	movhi	r0, #1			@ x > y
+	moveq	r0, #0			@ x == y
+	movlo	r0, #-1			@ x < y
+	mov	pc, lr
+
+ENDPROC(__aeabi_ulcmp)
+
+#endif
+