path: root/src/cmd/compile/internal/ssa/gen
author    Cherry Zhang <cherryyz@google.com>  2020-10-28 09:12:20 -0400
committer Cherry Zhang <cherryyz@google.com>  2020-10-28 09:12:20 -0400
commit    a16e30d162c1c7408db7821e7b9513cefa09c6ca (patch)
tree      af752ba9ba44c547df39bb0af9bff79f610ba9d5 /src/cmd/compile/internal/ssa/gen
parent    91e4d2d57bc341dd82c98247117114c851380aef (diff)
parent    cf6cfba4d5358404dd890f6025e573a4b2156543 (diff)
[dev.link] all: merge branch 'master' into dev.link

Clean merge.

Change-Id: Ia7b2808bc649790198d34c226a61d9e569084dc5
Diffstat (limited to 'src/cmd/compile/internal/ssa/gen')
-rw-r--r--  src/cmd/compile/internal/ssa/gen/386.rules        12
-rw-r--r--  src/cmd/compile/internal/ssa/gen/386Ops.go        14
-rw-r--r--  src/cmd/compile/internal/ssa/gen/AMD64.rules      14
-rw-r--r--  src/cmd/compile/internal/ssa/gen/AMD64Ops.go       2
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARM.rules        34
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARM64.rules     182
-rw-r--r--  src/cmd/compile/internal/ssa/gen/ARM64Ops.go       6
-rw-r--r--  src/cmd/compile/internal/ssa/gen/MIPS.rules       11
-rw-r--r--  src/cmd/compile/internal/ssa/gen/MIPS64.rules      6
-rw-r--r--  src/cmd/compile/internal/ssa/gen/PPC64.rules      46
-rw-r--r--  src/cmd/compile/internal/ssa/gen/PPC64Ops.go      23
-rw-r--r--  src/cmd/compile/internal/ssa/gen/RISCV64.rules   127
-rw-r--r--  src/cmd/compile/internal/ssa/gen/RISCV64Ops.go    56
-rw-r--r--  src/cmd/compile/internal/ssa/gen/S390X.rules       3
-rw-r--r--  src/cmd/compile/internal/ssa/gen/S390XOps.go       4
-rw-r--r--  src/cmd/compile/internal/ssa/gen/dec64.rules     137
-rw-r--r--  src/cmd/compile/internal/ssa/gen/generic.rules    44
-rw-r--r--  src/cmd/compile/internal/ssa/gen/genericOps.go    22
-rw-r--r--  src/cmd/compile/internal/ssa/gen/rulegen.go      109
19 files changed, 521 insertions, 331 deletions
diff --git a/src/cmd/compile/internal/ssa/gen/386.rules b/src/cmd/compile/internal/ssa/gen/386.rules
index 4a8244eb27..4e6cc8c692 100644
--- a/src/cmd/compile/internal/ssa/gen/386.rules
+++ b/src/cmd/compile/internal/ssa/gen/386.rules
@@ -38,10 +38,8 @@
(Xor(32|16|8) ...) => (XORL ...)
(Neg(32|16|8) ...) => (NEGL ...)
-(Neg32F x) && !config.use387 => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))]))
-(Neg64F x) && !config.use387 => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)]))
-(Neg32F x) && config.use387 => (FCHS x)
-(Neg64F x) && config.use387 => (FCHS x)
+(Neg32F x) => (PXOR x (MOVSSconst <typ.Float32> [float32(math.Copysign(0, -1))]))
+(Neg64F x) => (PXOR x (MOVSDconst <typ.Float64> [math.Copysign(0, -1)]))
(Com(32|16|8) ...) => (NOTL ...)
@@ -312,7 +310,7 @@
(Const32 ...) => (MOVLconst ...)
(Const(32|64)F ...) => (MOVS(S|D)const ...)
(ConstNil) => (MOVLconst [0])
-(ConstBool [c]) => (MOVLconst [int32(b2i(c))])
+(ConstBool [c]) => (MOVLconst [b2i32(c)])
// Lowering calls
(StaticCall ...) => (CALLstatic ...)
@@ -670,8 +668,8 @@
// Merge load/store to op
((ADD|AND|OR|XOR|SUB|MUL)L x l:(MOVLload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|AND|OR|XOR|SUB|MUL)Lload x [off] {sym} ptr mem)
-((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
-((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && !config.use387 && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SD x l:(MOVSDload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SDload x [off] {sym} ptr mem)
+((ADD|SUB|MUL|DIV)SS x l:(MOVSSload [off] {sym} ptr mem)) && canMergeLoadClobber(v, l, x) && clobber(l) => ((ADD|SUB|MUL|DIV)SSload x [off] {sym} ptr mem)
(MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
(MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) =>
((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem)
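
[Note] The Neg32F/Neg64F rules kept above work because math.Copysign(0, -1) is -0.0, a constant whose only set bit is the IEEE-754 sign bit, so the PXOR flips just the sign. An illustrative Go sketch of that identity (negateBits is our name, not a compiler helper):

package main

import (
	"fmt"
	"math"
)

// negateBits flips only the IEEE-754 sign bit -- exactly what XORing
// with the MOVSDconst value math.Copysign(0, -1) (-0.0) does above.
func negateBits(x float64) float64 {
	signBit := math.Float64bits(math.Copysign(0, -1)) // 1 << 63
	return math.Float64frombits(math.Float64bits(x) ^ signBit)
}

func main() {
	fmt.Println(negateBits(3.5), negateBits(-2)) // -3.5 2
}
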
diff --git a/src/cmd/compile/internal/ssa/gen/386Ops.go b/src/cmd/compile/internal/ssa/gen/386Ops.go
index ddabde7d3d..737b99c371 100644
--- a/src/cmd/compile/internal/ssa/gen/386Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/386Ops.go
@@ -51,17 +51,6 @@ var regNames386 = []string{
"SB",
}
-// Notes on 387 support.
-// - The 387 has a weird stack-register setup for floating-point registers.
-// We use these registers when SSE registers are not available (when GO386=387).
-// - We use the same register names (X0-X7) but they refer to the 387
-// floating-point registers. That way, most of the SSA backend is unchanged.
-// - The instruction generation pass maintains an SSE->387 register mapping.
-// This mapping is updated whenever the FP stack is pushed or popped so that
-// we can always find a given SSE register even when the TOS pointer has changed.
-// - To facilitate the mapping from SSE to 387, we enforce that
-// every basic block starts and ends with an empty floating-point stack.
-
func init() {
// Make map from reg names to reg integers.
if len(regNames386) > 64 {
@@ -552,9 +541,6 @@ func init() {
{name: "FlagGT_UGT"}, // signed > and unsigned <
{name: "FlagGT_ULT"}, // signed > and unsigned >
- // Special op for -x on 387
- {name: "FCHS", argLength: 1, reg: fp11},
-
// Special ops for PIC floating-point constants.
// MOVSXconst1 loads the address of the constant-pool entry into a register.
// MOVSXconst2 loads the constant from that address.
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules
index 408678f054..934e7dfdb6 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64.rules
+++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules
@@ -401,7 +401,7 @@
(Const32F ...) => (MOVSSconst ...)
(Const64F ...) => (MOVSDconst ...)
(ConstNil ) => (MOVQconst [0])
-(ConstBool [c]) => (MOVLconst [int32(b2i(c))])
+(ConstBool [c]) => (MOVLconst [b2i32(c)])
// Lowering calls
(StaticCall ...) => (CALLstatic ...)
@@ -530,8 +530,10 @@
(AtomicCompareAndSwap64 ptr old new_ mem) => (CMPXCHGQlock ptr old new_ mem)
// Atomic memory updates.
-(AtomicAnd8 ptr val mem) => (ANDBlock ptr val mem)
-(AtomicOr8 ptr val mem) => (ORBlock ptr val mem)
+(AtomicAnd8 ptr val mem) => (ANDBlock ptr val mem)
+(AtomicAnd32 ptr val mem) => (ANDLlock ptr val mem)
+(AtomicOr8 ptr val mem) => (ORBlock ptr val mem)
+(AtomicOr32 ptr val mem) => (ORLlock ptr val mem)
// Write barrier.
(WB ...) => (LoweredWB ...)
@@ -957,7 +959,7 @@
(MUL(Q|L)const [73] x) => (LEA(Q|L)8 x (LEA(Q|L)8 <v.Type> x x))
(MUL(Q|L)const [81] x) => (LEA(Q|L)8 (LEA(Q|L)8 <v.Type> x x) (LEA(Q|L)8 <v.Type> x x))
-(MUL(Q|L)const [c] x) && isPowerOfTwo(int64(c)+1) && c >= 15 => (SUB(Q|L) (SHL(Q|L)const <v.Type> [int8(log2(int64(c)+1))] x) x)
+(MUL(Q|L)const [c] x) && isPowerOfTwo64(int64(c)+1) && c >= 15 => (SUB(Q|L) (SHL(Q|L)const <v.Type> [int8(log2(int64(c)+1))] x) x)
(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-1) && c >= 17 => (LEA(Q|L)1 (SHL(Q|L)const <v.Type> [int8(log32(c-1))] x) x)
(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-2) && c >= 34 => (LEA(Q|L)2 (SHL(Q|L)const <v.Type> [int8(log32(c-2))] x) x)
(MUL(Q|L)const [c] x) && isPowerOfTwo32(c-4) && c >= 68 => (LEA(Q|L)4 (SHL(Q|L)const <v.Type> [int8(log32(c-4))] x) x)
@@ -1274,8 +1276,8 @@
(CMPQconst (ANDQconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
(CMPQconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
(CMPLconst (ANDLconst _ [m]) [n]) && 0 <= m && m < n => (FlagLT_ULT)
-(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= m && int16(m) < n => (FlagLT_ULT)
-(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= m && int8(m) < n => (FlagLT_ULT)
+(CMPWconst (ANDLconst _ [m]) [n]) && 0 <= int16(m) && int16(m) < n => (FlagLT_ULT)
+(CMPBconst (ANDLconst _ [m]) [n]) && 0 <= int8(m) && int8(m) < n => (FlagLT_ULT)
// TESTQ c c sets flags like CMPQ c 0.
(TESTQconst [c] (MOVQconst [d])) && int64(c) == d && c == 0 => (FlagEQ)
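
[Note] The CMPWconst/CMPBconst change above narrows the guard to the truncated width (int16/int8) because m is a 32-bit aux value whose high bits may be set. The b2i32 and isPowerOfTwo64 helpers the new guards call are one-liners in cmd/compile/internal/ssa/rewrite.go; this sketch matches how the rules use them:

package main

import "fmt"

// b2i32 maps a bool to 0/1 for the ConstBool lowering above.
func b2i32(b bool) int32 {
	if b {
		return 1
	}
	return 0
}

// isPowerOfTwo64 guards the multiply strength-reduction rules.
func isPowerOfTwo64(n int64) bool {
	return n > 0 && n&(n-1) == 0
}

func main() {
	fmt.Println(b2i32(true), isPowerOfTwo64(64), isPowerOfTwo64(65)) // 1 true false
}
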
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
index 2df5016d59..de5372670b 100644
--- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go
@@ -902,7 +902,9 @@ func init() {
// Atomic memory updates.
{name: "ANDBlock", argLength: 3, reg: gpstore, asm: "ANDB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
+ {name: "ANDLlock", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) &= arg1
{name: "ORBlock", argLength: 3, reg: gpstore, asm: "ORB", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
+ {name: "ORLlock", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true, symEffect: "RdWr"}, // *(arg0+auxint+aux) |= arg1
}
var AMD64blocks = []blockData{
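
[Note] The new ANDLlock/ORLlock ops let AtomicAnd32/AtomicOr32 lower to a single LOCK-prefixed instruction. Semantically they match this portable sketch (an illustrative CAS loop, not the runtime's implementation):

package main

import (
	"fmt"
	"sync/atomic"
)

// atomicAnd32 does *addr &= mask atomically -- the semantics ANDLlock
// implements in one LOCK ANDL on AMD64.
func atomicAnd32(addr *uint32, mask uint32) {
	for {
		old := atomic.LoadUint32(addr)
		if atomic.CompareAndSwapUint32(addr, old, old&mask) {
			return
		}
	}
}

func main() {
	v := uint32(0b1111)
	atomicAnd32(&v, 0b0101)
	fmt.Printf("%04b\n", v) // 0101
}
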
diff --git a/src/cmd/compile/internal/ssa/gen/ARM.rules b/src/cmd/compile/internal/ssa/gen/ARM.rules
index 9490805f46..f48abcd202 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM.rules
@@ -169,10 +169,10 @@
(Rsh8x64 x (Const64 [c])) && uint64(c) >= 8 => (SRAconst (SLLconst <typ.UInt32> x [24]) [31])
// constants
-(Const(8|16|32) ...) -> (MOVWconst ...)
-(Const(32F|64F) ...) -> (MOV(F|D)const ...)
+(Const(8|16|32) [val]) => (MOVWconst [int32(val)])
+(Const(32|64)F [val]) => (MOV(F|D)const [float64(val)])
(ConstNil) => (MOVWconst [0])
-(ConstBool ...) -> (MOVWconst ...)
+(ConstBool [b]) => (MOVWconst [b2i32(b)])
// truncations
// Because we ignore high parts of registers, truncates are just copies.
@@ -243,10 +243,10 @@
(Leq16U x y) => (LessEqualU (CMP (ZeroExt16to32 x) (ZeroExt16to32 y)))
(Leq32U x y) => (LessEqualU (CMP x y))
-(OffPtr [off] ptr:(SP)) -> (MOVWaddr [off] ptr)
-(OffPtr [off] ptr) -> (ADDconst [off] ptr)
+(OffPtr [off] ptr:(SP)) => (MOVWaddr [int32(off)] ptr)
+(OffPtr [off] ptr) => (ADDconst [int32(off)] ptr)
-(Addr ...) -> (MOVWaddr ...)
+(Addr {sym} base) => (MOVWaddr {sym} base)
(LocalAddr {sym} base _) => (MOVWaddr {sym} base)
// loads
@@ -1052,8 +1052,8 @@
(BICshiftRL x (MOVWconst [c]) [d]) => (BICconst x [int32(uint32(c)>>uint64(d))])
(BICshiftRA x (MOVWconst [c]) [d]) => (BICconst x [c>>uint64(d)])
(MVNshiftLL (MOVWconst [c]) [d]) => (MOVWconst [^(c<<uint64(d))])
-(MVNshiftRL (MOVWconst [c]) [d]) -> (MOVWconst [^int64(uint32(c)>>uint64(d))])
-(MVNshiftRA (MOVWconst [c]) [d]) -> (MOVWconst [^int64(int32(c)>>uint64(d))])
+(MVNshiftRL (MOVWconst [c]) [d]) => (MOVWconst [^int32(uint32(c)>>uint64(d))])
+(MVNshiftRA (MOVWconst [c]) [d]) => (MOVWconst [int32(c)>>uint64(d)])
(CMPshiftLL x (MOVWconst [c]) [d]) => (CMPconst x [c<<uint64(d)])
(CMPshiftRL x (MOVWconst [c]) [d]) => (CMPconst x [int32(uint32(c)>>uint64(d))])
(CMPshiftRA x (MOVWconst [c]) [d]) => (CMPconst x [c>>uint64(d)])
@@ -1190,12 +1190,12 @@
(MOVWstoreidx ptr (SRAconst idx [c]) val mem) => (MOVWstoreshiftRA ptr idx [c] val mem)
(MOVWstoreidx (SRAconst idx [c]) ptr val mem) => (MOVWstoreshiftRA ptr idx [c] val mem)
-(MOVWloadshiftLL ptr (MOVWconst [c]) [d] mem) -> (MOVWload [int64(uint32(c)<<uint64(d))] ptr mem)
-(MOVWloadshiftRL ptr (MOVWconst [c]) [d] mem) -> (MOVWload [int64(uint32(c)>>uint64(d))] ptr mem)
+(MOVWloadshiftLL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)<<uint64(d))] ptr mem)
+(MOVWloadshiftRL ptr (MOVWconst [c]) [d] mem) => (MOVWload [int32(uint32(c)>>uint64(d))] ptr mem)
(MOVWloadshiftRA ptr (MOVWconst [c]) [d] mem) => (MOVWload [c>>uint64(d)] ptr mem)
-(MOVWstoreshiftLL ptr (MOVWconst [c]) [d] val mem) -> (MOVWstore [int64(uint32(c)<<uint64(d))] ptr val mem)
-(MOVWstoreshiftRL ptr (MOVWconst [c]) [d] val mem) -> (MOVWstore [int64(uint32(c)>>uint64(d))] ptr val mem)
+(MOVWstoreshiftLL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)<<uint64(d))] ptr val mem)
+(MOVWstoreshiftRL ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [int32(uint32(c)>>uint64(d))] ptr val mem)
(MOVWstoreshiftRA ptr (MOVWconst [c]) [d] val mem) => (MOVWstore [c>>uint64(d)] ptr val mem)
// generic simplifications
@@ -1263,8 +1263,8 @@
(SRLconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 => (BFXU [(d-c)|(32-d)<<8] x)
// comparison simplification
-(CMP x (RSBconst [0] y)) => (CMN x y)
-(CMN x (RSBconst [0] y)) => (CMP x y)
+((LT|LE|EQ|NE|GE|GT) (CMP x (RSBconst [0] y))) => ((LT|LE|EQ|NE|GE|GT) (CMN x y)) // sense of carry bit not preserved
+((LT|LE|EQ|NE|GE|GT) (CMN x (RSBconst [0] y))) => ((LT|LE|EQ|NE|GE|GT) (CMP x y)) // sense of carry bit not preserved
(EQ (CMPconst [0] l:(SUB x y)) yes no) && l.Uses==1 => (EQ (CMP x y) yes no)
(EQ (CMPconst [0] l:(MULS x y a)) yes no) && l.Uses==1 => (EQ (CMP a (MUL <x.Type> x y)) yes no)
(EQ (CMPconst [0] l:(SUBconst [c] x)) yes no) && l.Uses==1 => (EQ (CMPconst [c] x) yes no)
@@ -1470,6 +1470,6 @@
(GE (CMPconst [0] l:(XORshiftRLreg x y z)) yes no) && l.Uses==1 => (GE (TEQshiftRLreg x y z) yes no)
(GE (CMPconst [0] l:(XORshiftRAreg x y z)) yes no) && l.Uses==1 => (GE (TEQshiftRAreg x y z) yes no)
-(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) -> (MOVWconst [int64(read8(sym, off))])
-(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) -> (MOVWconst [int64(read16(sym, off, config.ctxt.Arch.ByteOrder))])
-(MOVWload [off] {sym} (SB) _) && symIsRO(sym) -> (MOVWconst [int64(int32(read32(sym, off, config.ctxt.Arch.ByteOrder)))])
+(MOVBUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read8(sym, int64(off)))])
+(MOVHUload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read16(sym, int64(off), config.ctxt.Arch.ByteOrder))])
+(MOVWload [off] {sym} (SB) _) && symIsRO(sym) => (MOVWconst [int32(read32(sym, int64(off), config.ctxt.Arch.ByteOrder))])
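
[Note] The restricted CMP/CMN rewrite above ("sense of carry bit not preserved") reflects that x-(-y) and x+y agree on result bits but not on carry out, so only carry-insensitive conditions (LT|LE|EQ|NE|GE|GT) may consume the rewritten flags. A sketch, assuming ARM's convention that C after a compare/subtract is the inverted borrow:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// x - (-y) and x + y agree on result bits but not on the carry:
	// ARM sets C = NOT borrow after CMP, C = carry out after CMN.
	x, y := uint32(1), uint32(0)
	d, borrow := bits.Sub32(x, -y, 0) // CMP x, (RSBconst [0] y)
	s, carry := bits.Add32(x, y, 0)   // CMN x y
	cAfterCMP := 1 - borrow
	fmt.Println(d == s, cAfterCMP, carry) // true 1 0 -- same bits, different C
}
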
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64.rules b/src/cmd/compile/internal/ssa/gen/ARM64.rules
index c4a3532632..c50a8c7778 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -548,8 +548,10 @@
(AtomicCompareAndSwap(32|64) ...) => (LoweredAtomicCas(32|64) ...)
// Currently the updated value is not used, but we need a register to temporarily hold it.
-(AtomicAnd8 ptr val mem) => (Select1 (LoweredAtomicAnd8 ptr val mem))
-(AtomicOr8 ptr val mem) => (Select1 (LoweredAtomicOr8 ptr val mem))
+(AtomicAnd8 ptr val mem) => (Select1 (LoweredAtomicAnd8 ptr val mem))
+(AtomicAnd32 ptr val mem) => (Select1 (LoweredAtomicAnd32 ptr val mem))
+(AtomicOr8 ptr val mem) => (Select1 (LoweredAtomicOr8 ptr val mem))
+(AtomicOr32 ptr val mem) => (Select1 (LoweredAtomicOr32 ptr val mem))
(AtomicAdd(32|64)Variant ...) => (LoweredAtomicAdd(32|64)Variant ...)
@@ -1171,145 +1173,145 @@
(MUL x (MOVDconst [-1])) => (NEG x)
(MUL _ (MOVDconst [0])) => (MOVDconst [0])
(MUL x (MOVDconst [1])) => x
-(MUL x (MOVDconst [c])) && isPowerOfTwo(c) => (SLLconst [log2(c)] x)
-(MUL x (MOVDconst [c])) && isPowerOfTwo(c-1) && c >= 3 => (ADDshiftLL x x [log2(c-1)])
-(MUL x (MOVDconst [c])) && isPowerOfTwo(c+1) && c >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log2(c+1)])
-(MUL x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) => (SLLconst [log2(c/3)] (ADDshiftLL <x.Type> x x [1]))
-(MUL x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) => (SLLconst [log2(c/5)] (ADDshiftLL <x.Type> x x [2]))
-(MUL x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) => (SLLconst [log2(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
-(MUL x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) => (SLLconst [log2(c/9)] (ADDshiftLL <x.Type> x x [3]))
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log2(c)] x)
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (ADDshiftLL x x [log2(c-1)])
+(MUL x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log2(c+1)])
+(MUL x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst [log2(c/3)] (ADDshiftLL <x.Type> x x [1]))
+(MUL x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SLLconst [log2(c/5)] (ADDshiftLL <x.Type> x x [2]))
+(MUL x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst [log2(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
+(MUL x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SLLconst [log2(c/9)] (ADDshiftLL <x.Type> x x [3]))
(MULW x (MOVDconst [c])) && int32(c)==-1 => (NEG x)
(MULW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0])
(MULW x (MOVDconst [c])) && int32(c)==1 => x
-(MULW x (MOVDconst [c])) && isPowerOfTwo(c) => (SLLconst [log2(c)] x)
-(MULW x (MOVDconst [c])) && isPowerOfTwo(c-1) && int32(c) >= 3 => (ADDshiftLL x x [log2(c-1)])
-(MULW x (MOVDconst [c])) && isPowerOfTwo(c+1) && int32(c) >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log2(c+1)])
-(MULW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) => (SLLconst [log2(c/3)] (ADDshiftLL <x.Type> x x [1]))
-(MULW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) => (SLLconst [log2(c/5)] (ADDshiftLL <x.Type> x x [2]))
-(MULW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) => (SLLconst [log2(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
-(MULW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) => (SLLconst [log2(c/9)] (ADDshiftLL <x.Type> x x [3]))
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c) => (SLLconst [log2(c)] x)
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (ADDshiftLL x x [log2(c-1)])
+(MULW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (ADDshiftLL (NEG <x.Type> x) x [log2(c+1)])
+(MULW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst [log2(c/3)] (ADDshiftLL <x.Type> x x [1]))
+(MULW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SLLconst [log2(c/5)] (ADDshiftLL <x.Type> x x [2]))
+(MULW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst [log2(c/7)] (ADDshiftLL <x.Type> (NEG <x.Type> x) x [3]))
+(MULW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SLLconst [log2(c/9)] (ADDshiftLL <x.Type> x x [3]))
// mneg by constant
(MNEG x (MOVDconst [-1])) => x
(MNEG _ (MOVDconst [0])) => (MOVDconst [0])
(MNEG x (MOVDconst [1])) => (NEG x)
-(MNEG x (MOVDconst [c])) && isPowerOfTwo(c) => (NEG (SLLconst <x.Type> [log2(c)] x))
-(MNEG x (MOVDconst [c])) && isPowerOfTwo(c-1) && c >= 3 => (NEG (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MNEG x (MOVDconst [c])) && isPowerOfTwo(c+1) && c >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log2(c+1)]))
-(MNEG x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) => (SLLconst <x.Type> [log2(c/3)] (SUBshiftLL <x.Type> x x [2]))
-(MNEG x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) => (NEG (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])))
-(MNEG x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) => (SLLconst <x.Type> [log2(c/7)] (SUBshiftLL <x.Type> x x [3]))
-(MNEG x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) => (NEG (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log2(c)] x))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c >= 3 => (NEG (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MNEG x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log2(c+1)]))
+(MNEG x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SLLconst <x.Type> [log2(c/3)] (SUBshiftLL <x.Type> x x [2]))
+(MNEG x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (NEG (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])))
+(MNEG x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SLLconst <x.Type> [log2(c/7)] (SUBshiftLL <x.Type> x x [3]))
+(MNEG x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (NEG (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])))
(MNEGW x (MOVDconst [c])) && int32(c)==-1 => x
(MNEGW _ (MOVDconst [c])) && int32(c)==0 => (MOVDconst [0])
(MNEGW x (MOVDconst [c])) && int32(c)==1 => (NEG x)
-(MNEGW x (MOVDconst [c])) && isPowerOfTwo(c) => (NEG (SLLconst <x.Type> [log2(c)] x))
-(MNEGW x (MOVDconst [c])) && isPowerOfTwo(c-1) && int32(c) >= 3 => (NEG (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MNEGW x (MOVDconst [c])) && isPowerOfTwo(c+1) && int32(c) >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log2(c+1)]))
-(MNEGW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) => (SLLconst <x.Type> [log2(c/3)] (SUBshiftLL <x.Type> x x [2]))
-(MNEGW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) => (NEG (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])))
-(MNEGW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) => (SLLconst <x.Type> [log2(c/7)] (SUBshiftLL <x.Type> x x [3]))
-(MNEGW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) => (NEG (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c) => (NEG (SLLconst <x.Type> [log2(c)] x))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c) >= 3 => (NEG (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MNEGW x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c) >= 7 => (NEG (ADDshiftLL <x.Type> (NEG <x.Type> x) x [log2(c+1)]))
+(MNEGW x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SLLconst <x.Type> [log2(c/3)] (SUBshiftLL <x.Type> x x [2]))
+(MNEGW x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (NEG (SLLconst <x.Type> [log2(c/5)] (ADDshiftLL <x.Type> x x [2])))
+(MNEGW x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SLLconst <x.Type> [log2(c/7)] (SUBshiftLL <x.Type> x x [3]))
+(MNEGW x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (NEG (SLLconst <x.Type> [log2(c/9)] (ADDshiftLL <x.Type> x x [3])))
(MADD a x (MOVDconst [-1])) => (SUB a x)
(MADD a _ (MOVDconst [0])) => a
(MADD a x (MOVDconst [1])) => (ADD a x)
-(MADD a x (MOVDconst [c])) && isPowerOfTwo(c) => (ADDshiftLL a x [log2(c)])
-(MADD a x (MOVDconst [c])) && isPowerOfTwo(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MADD a x (MOVDconst [c])) && isPowerOfTwo(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MADD a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MADD a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MADD a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MADD a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log2(c)])
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MADD a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MADD a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MADD a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MADD a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MADD a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
(MADD a (MOVDconst [-1]) x) => (SUB a x)
(MADD a (MOVDconst [0]) _) => a
(MADD a (MOVDconst [1]) x) => (ADD a x)
-(MADD a (MOVDconst [c]) x) && isPowerOfTwo(c) => (ADDshiftLL a x [log2(c)])
-(MADD a (MOVDconst [c]) x) && isPowerOfTwo(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MADD a (MOVDconst [c]) x) && isPowerOfTwo(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MADD a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MADD a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MADD a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MADD a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log2(c)])
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MADD a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MADD a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MADD a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MADD a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MADD a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
(MADDW a x (MOVDconst [c])) && int32(c)==-1 => (SUB a x)
(MADDW a _ (MOVDconst [c])) && int32(c)==0 => a
(MADDW a x (MOVDconst [c])) && int32(c)==1 => (ADD a x)
-(MADDW a x (MOVDconst [c])) && isPowerOfTwo(c) => (ADDshiftLL a x [log2(c)])
-(MADDW a x (MOVDconst [c])) && isPowerOfTwo(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MADDW a x (MOVDconst [c])) && isPowerOfTwo(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MADDW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MADDW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MADDW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MADDW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (ADDshiftLL a x [log2(c)])
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MADDW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MADDW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MADDW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MADDW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MADDW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
(MADDW a (MOVDconst [c]) x) && int32(c)==-1 => (SUB a x)
(MADDW a (MOVDconst [c]) _) && int32(c)==0 => a
(MADDW a (MOVDconst [c]) x) && int32(c)==1 => (ADD a x)
-(MADDW a (MOVDconst [c]) x) && isPowerOfTwo(c) => (ADDshiftLL a x [log2(c)])
-(MADDW a (MOVDconst [c]) x) && isPowerOfTwo(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MADDW a (MOVDconst [c]) x) && isPowerOfTwo(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MADDW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MADDW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MADDW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MADDW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (ADDshiftLL a x [log2(c)])
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (ADD a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MADDW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (SUB a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MADDW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MADDW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MADDW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (SUBshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MADDW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (ADDshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
(MSUB a x (MOVDconst [-1])) => (ADD a x)
(MSUB a _ (MOVDconst [0])) => a
(MSUB a x (MOVDconst [1])) => (SUB a x)
-(MSUB a x (MOVDconst [c])) && isPowerOfTwo(c) => (SUBshiftLL a x [log2(c)])
-(MSUB a x (MOVDconst [c])) && isPowerOfTwo(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MSUB a x (MOVDconst [c])) && isPowerOfTwo(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MSUB a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MSUB a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MSUB a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MSUB a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log2(c)])
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MSUB a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MSUB a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MSUB a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MSUB a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MSUB a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
(MSUB a (MOVDconst [-1]) x) => (ADD a x)
(MSUB a (MOVDconst [0]) _) => a
(MSUB a (MOVDconst [1]) x) => (SUB a x)
-(MSUB a (MOVDconst [c]) x) && isPowerOfTwo(c) => (SUBshiftLL a x [log2(c)])
-(MSUB a (MOVDconst [c]) x) && isPowerOfTwo(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MSUB a (MOVDconst [c]) x) && isPowerOfTwo(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MSUB a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MSUB a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MSUB a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MSUB a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log2(c)])
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && c>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MSUB a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && c>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MSUB a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MSUB a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MSUB a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MSUB a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
(MSUBW a x (MOVDconst [c])) && int32(c)==-1 => (ADD a x)
(MSUBW a _ (MOVDconst [c])) && int32(c)==0 => a
(MSUBW a x (MOVDconst [c])) && int32(c)==1 => (SUB a x)
-(MSUBW a x (MOVDconst [c])) && isPowerOfTwo(c) => (SUBshiftLL a x [log2(c)])
-(MSUBW a x (MOVDconst [c])) && isPowerOfTwo(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MSUBW a x (MOVDconst [c])) && isPowerOfTwo(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MSUBW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MSUBW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MSUBW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MSUBW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c) => (SUBshiftLL a x [log2(c)])
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MSUBW a x (MOVDconst [c])) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MSUBW a x (MOVDconst [c])) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MSUBW a x (MOVDconst [c])) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MSUBW a x (MOVDconst [c])) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MSUBW a x (MOVDconst [c])) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
(MSUBW a (MOVDconst [c]) x) && int32(c)==-1 => (ADD a x)
(MSUBW a (MOVDconst [c]) _) && int32(c)==0 => a
(MSUBW a (MOVDconst [c]) x) && int32(c)==1 => (SUB a x)
-(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo(c) => (SUBshiftLL a x [log2(c)])
-(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
-(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
-(MSUBW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
-(MSUBW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
-(MSUBW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
-(MSUBW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c) => (SUBshiftLL a x [log2(c)])
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c-1) && int32(c)>=3 => (SUB a (ADDshiftLL <x.Type> x x [log2(c-1)]))
+(MSUBW a (MOVDconst [c]) x) && isPowerOfTwo64(c+1) && int32(c)>=7 => (ADD a (SUBshiftLL <x.Type> x x [log2(c+1)]))
+(MSUBW a (MOVDconst [c]) x) && c%3 == 0 && isPowerOfTwo64(c/3) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [2]) [log2(c/3)])
+(MSUBW a (MOVDconst [c]) x) && c%5 == 0 && isPowerOfTwo64(c/5) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [2]) [log2(c/5)])
+(MSUBW a (MOVDconst [c]) x) && c%7 == 0 && isPowerOfTwo64(c/7) && is32Bit(c) => (ADDshiftLL a (SUBshiftLL <x.Type> x x [3]) [log2(c/7)])
+(MSUBW a (MOVDconst [c]) x) && c%9 == 0 && isPowerOfTwo64(c/9) && is32Bit(c) => (SUBshiftLL a (ADDshiftLL <x.Type> x x [3]) [log2(c/9)])
// div by constant
(UDIV x (MOVDconst [1])) => x
-(UDIV x (MOVDconst [c])) && isPowerOfTwo(c) => (SRLconst [log2(c)] x)
+(UDIV x (MOVDconst [c])) && isPowerOfTwo64(c) => (SRLconst [log2(c)] x)
(UDIVW x (MOVDconst [c])) && uint32(c)==1 => x
-(UDIVW x (MOVDconst [c])) && isPowerOfTwo(c) && is32Bit(c) => (SRLconst [log2(c)] x)
+(UDIVW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (SRLconst [log2(c)] x)
(UMOD _ (MOVDconst [1])) => (MOVDconst [0])
-(UMOD x (MOVDconst [c])) && isPowerOfTwo(c) => (ANDconst [c-1] x)
+(UMOD x (MOVDconst [c])) && isPowerOfTwo64(c) => (ANDconst [c-1] x)
(UMODW _ (MOVDconst [c])) && uint32(c)==1 => (MOVDconst [0])
-(UMODW x (MOVDconst [c])) && isPowerOfTwo(c) && is32Bit(c) => (ANDconst [c-1] x)
+(UMODW x (MOVDconst [c])) && isPowerOfTwo64(c) && is32Bit(c) => (ANDconst [c-1] x)
// generic simplifications
(ADD x (NEG y)) => (SUB x y)
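
[Note] All the isPowerOfTwo64 rewrites above reduce a multiply to shifts and adds; the underlying identities, as a sketch:

package main

import "fmt"

func main() {
	// Shift/add identities behind the MUL-by-constant rules above.
	x := int64(11)
	fmt.Println(x*8 == x<<3)    // isPowerOfTwo64(c):   (SLLconst [log2(c)] x)
	fmt.Println(x*9 == x+x<<3)  // isPowerOfTwo64(c-1): (ADDshiftLL x x [log2(c-1)])
	fmt.Println(x*7 == -x+x<<3) // isPowerOfTwo64(c+1): (ADDshiftLL (NEG x) x [log2(c+1)])
}
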
diff --git a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
index 9ff53f7e4e..fe9edbf933 100644
--- a/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/ARM64Ops.go
@@ -656,12 +656,14 @@ func init() {
// atomic and/or.
// *arg0 &= (|=) arg1. arg2=mem. returns <new content of *arg0, memory>. auxint must be zero.
- // LDAXRB (Rarg0), Rout
+ // LDAXR (Rarg0), Rout
// AND/OR Rarg1, Rout
- // STLXRB Rout, (Rarg0), Rtmp
+ // STLXR Rout, (Rarg0), Rtmp
// CBNZ Rtmp, -3(PC)
{name: "LoweredAtomicAnd8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicAnd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "AND", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicOr8", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt8,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
+ {name: "LoweredAtomicOr32", argLength: 3, reg: gpxchg, resultNotInArgs: true, asm: "ORR", typ: "(UInt32,Mem)", faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
// It saves all GP registers if necessary,
diff --git a/src/cmd/compile/internal/ssa/gen/MIPS.rules b/src/cmd/compile/internal/ssa/gen/MIPS.rules
index 96feaf9234..b6e5312224 100644
--- a/src/cmd/compile/internal/ssa/gen/MIPS.rules
+++ b/src/cmd/compile/internal/ssa/gen/MIPS.rules
@@ -143,7 +143,7 @@
(Const(32|16|8) [val]) => (MOVWconst [int32(val)])
(Const(32|64)F ...) => (MOV(F|D)const ...)
(ConstNil) => (MOVWconst [0])
-(ConstBool [b]) => (MOVWconst [int32(b2i(b))])
+(ConstBool [b]) => (MOVWconst [b2i32(b)])
// truncations
// Because we ignore high parts of registers, truncates are just copies.
@@ -383,6 +383,9 @@
(ANDconst <typ.UInt32> [3]
(XORconst <typ.UInt32> [3] ptr)))))) mem)
+(AtomicAnd32 ...) => (LoweredAtomicAnd ...)
+(AtomicOr32 ...) => (LoweredAtomicOr ...)
+
// checks
(NilCheck ...) => (LoweredNilCheck ...)
@@ -581,13 +584,13 @@
(Select0 (MULTU (MOVWconst [1]) _ )) => (MOVWconst [0])
(Select1 (MULTU (MOVWconst [-1]) x )) => (NEG <x.Type> x)
(Select0 (MULTU (MOVWconst [-1]) x )) => (CMOVZ (ADDconst <x.Type> [-1] x) (MOVWconst [0]) x)
-(Select1 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
-(Select0 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo(int64(uint32(c))) => (SRLconst [int32(32-log2uint32(int64(c)))] x)
+(Select1 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
+(Select0 (MULTU (MOVWconst [c]) x )) && isPowerOfTwo64(int64(uint32(c))) => (SRLconst [int32(32-log2uint32(int64(c)))] x)
(MUL (MOVWconst [0]) _ ) => (MOVWconst [0])
(MUL (MOVWconst [1]) x ) => x
(MUL (MOVWconst [-1]) x ) => (NEG x)
-(MUL (MOVWconst [c]) x ) && isPowerOfTwo(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
+(MUL (MOVWconst [c]) x ) && isPowerOfTwo64(int64(uint32(c))) => (SLLconst [int32(log2uint32(int64(c)))] x)
// generic simplifications
(ADD x (NEG y)) => (SUB x y)
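
[Note] Per the rules above, Select1 of MULTU is the low word of the 32x32->64 product and Select0 the high word, so a power-of-two multiplier splits into the SLLconst/SRLconst pair. A sketch of that split:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// MULTU's 64-bit product: Select1 = low word, Select0 = high word.
	// With a power-of-two multiplier 1<<k the product is a pure shift.
	x, k := uint32(0xDEADBEEF), uint(4)
	hi, lo := bits.Mul32(x, 1<<k)
	fmt.Println(lo == x<<k, hi == x>>(32-k)) // true true
}
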
diff --git a/src/cmd/compile/internal/ssa/gen/MIPS64.rules b/src/cmd/compile/internal/ssa/gen/MIPS64.rules
index e008ec8703..8e4c3a07c8 100644
--- a/src/cmd/compile/internal/ssa/gen/MIPS64.rules
+++ b/src/cmd/compile/internal/ssa/gen/MIPS64.rules
@@ -580,13 +580,13 @@
(Select1 (MULVU x (MOVVconst [-1]))) => (NEGV x)
(Select1 (MULVU _ (MOVVconst [0]))) => (MOVVconst [0])
(Select1 (MULVU x (MOVVconst [1]))) => x
-(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo(c) => (SLLVconst [log2(c)] x)
+(Select1 (MULVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SLLVconst [log2(c)] x)
// div by constant
(Select1 (DIVVU x (MOVVconst [1]))) => x
-(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo(c) => (SRLVconst [log2(c)] x)
+(Select1 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (SRLVconst [log2(c)] x)
(Select0 (DIVVU _ (MOVVconst [1]))) => (MOVVconst [0]) // mod
-(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo(c) => (ANDconst [c-1] x) // mod
+(Select0 (DIVVU x (MOVVconst [c]))) && isPowerOfTwo64(c) => (ANDconst [c-1] x) // mod
// generic simplifications
(ADDV x (NEGV y)) => (SUBV x y)
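
[Note] The DIVVU rewrites rely on the usual unsigned power-of-two identities (quotient = logical shift, remainder = mask), sketched here:

package main

import "fmt"

func main() {
	// Unsigned divide/mod by 2^k reduce to shift and mask, matching
	// the SRLVconst/ANDconst rewrites above.
	x := uint64(1000003)
	fmt.Println(x/8 == x>>3, x%8 == x&7) // true true
}
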
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64.rules b/src/cmd/compile/internal/ssa/gen/PPC64.rules
index de30d003e6..558b09c9f2 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64.rules
+++ b/src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -150,6 +150,31 @@
(ROTLW x (MOVDconst [c])) => (ROTLWconst x [c&31])
(ROTL x (MOVDconst [c])) => (ROTLconst x [c&63])
+// Combine rotate and mask operations
+(ANDconst [m] (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x)
+(AND (MOVDconst [m]) (ROTLWconst [r] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,m,32)] x)
+(ANDconst [m] (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r)
+(AND (MOVDconst [m]) (ROTLW x r)) && isPPC64WordRotateMask(m) => (RLWNM [encodePPC64RotateMask(0,m,32)] x r)
+
+// Note, any rotated word bitmask is still a valid word bitmask.
+(ROTLWconst [r] (AND (MOVDconst [m]) x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x)
+(ROTLWconst [r] (ANDconst [m] x)) && isPPC64WordRotateMask(m) => (RLWINM [encodePPC64RotateMask(r,rotateLeft32(m,r),32)] x)
+
+(ANDconst [m] (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0])
+(ANDconst [m] (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x)
+(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64RShiftMask(m,s,32) == 0 => (MOVDconst [0])
+(AND (MOVDconst [m]) (SRWconst x [s])) && mergePPC64AndSrwi(m,s) != 0 => (RLWINM [mergePPC64AndSrwi(m,s)] x)
+
+(SRWconst (ANDconst [m] x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0])
+(SRWconst (ANDconst [m] x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x)
+(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64RShiftMask(m>>uint(s),s,32) == 0 => (MOVDconst [0])
+(SRWconst (AND (MOVDconst [m]) x) [s]) && mergePPC64AndSrwi(m>>uint(s),s) != 0 => (RLWINM [mergePPC64AndSrwi(m>>uint(s),s)] x)
+
+// Merge shift right + shift left and clear left (e.g for a table lookup)
+(CLRLSLDI [c] (SRWconst [s] x)) && mergePPC64ClrlsldiSrw(int64(c),s) != 0 => (RLWINM [mergePPC64ClrlsldiSrw(int64(c),s)] x)
+(SLDconst [l] (SRWconst [r] x)) && mergePPC64SldiSrw(l,r) != 0 => (RLWINM [mergePPC64SldiSrw(l,r)] x)
+// The following reduction shows up frequently too. e.g b[(x>>14)&0xFF]
+(CLRLSLDI [c] i:(RLWINM [s] x)) && mergePPC64ClrlsldiRlwinm(c,s) != 0 => (RLWINM [mergePPC64ClrlsldiRlwinm(c,s)] x)
// large constant shifts
(Lsh64x64 _ (MOVDconst [c])) && uint64(c) >= 64 => (MOVDconst [0])
@@ -821,6 +846,8 @@
(ADDconst [c] (MOVDaddr [d] {sym} x)) && is32Bit(c+int64(d)) => (MOVDaddr [int32(c+int64(d))] {sym} x)
+(MULL(W|D) x (MOVDconst [c])) && is16Bit(c) => (MULL(W|D)const [int32(c)] x)
+
// Subtract from (with carry, but ignored) constant.
// Note, these clobber the carry bit.
(SUB (MOVDconst [c]) x) && is32Bit(c) => (SUBFCconst [c] x)
@@ -965,10 +992,10 @@
// atomic intrinsics
(AtomicLoad(8|32|64|Ptr) ptr mem) => (LoweredAtomicLoad(8|32|64|Ptr) [1] ptr mem)
-(AtomicLoadAcq32 ptr mem) => (LoweredAtomicLoad32 [0] ptr mem)
+(AtomicLoadAcq(32|64) ptr mem) => (LoweredAtomicLoad(32|64) [0] ptr mem)
(AtomicStore(8|32|64) ptr val mem) => (LoweredAtomicStore(8|32|64) [1] ptr val mem)
-(AtomicStoreRel32 ptr val mem) => (LoweredAtomicStore32 [0] ptr val mem)
+(AtomicStoreRel(32|64) ptr val mem) => (LoweredAtomicStore(32|64) [0] ptr val mem)
//(AtomicStorePtrNoWB ptr val mem) => (STLR ptr val mem)
(AtomicExchange(32|64) ...) => (LoweredAtomicExchange(32|64) ...)
@@ -978,8 +1005,10 @@
(AtomicCompareAndSwap(32|64) ptr old new_ mem) => (LoweredAtomicCas(32|64) [1] ptr old new_ mem)
(AtomicCompareAndSwapRel32 ptr old new_ mem) => (LoweredAtomicCas32 [0] ptr old new_ mem)
-(AtomicAnd8 ...) => (LoweredAtomicAnd8 ...)
-(AtomicOr8 ...) => (LoweredAtomicOr8 ...)
+(AtomicAnd8 ...) => (LoweredAtomicAnd8 ...)
+(AtomicAnd32 ...) => (LoweredAtomicAnd32 ...)
+(AtomicOr8 ...) => (LoweredAtomicOr8 ...)
+(AtomicOr32 ...) => (LoweredAtomicOr32 ...)
(Slicemask <t> x) => (SRADconst (NEG <t> x) [63])
@@ -1018,13 +1047,12 @@
(SLDconst [c] z:(MOVHZreg x)) && c < 16 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,48,63,64)] x)
(SLDconst [c] z:(MOVWZreg x)) && c < 32 && z.Uses == 1 => (CLRLSLDI [newPPC64ShiftAuxInt(c,32,63,64)] x)
-(SLDconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
-(SLDconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
+(SLDconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c <= (64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
+(SLDconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(64-getPPC64ShiftMaskLength(d)) => (CLRLSLDI [newPPC64ShiftAuxInt(c,64-getPPC64ShiftMaskLength(d),63,64)] x)
(SLWconst [c] z:(MOVBZreg x)) && z.Uses == 1 && c < 8 => (CLRLSLWI [newPPC64ShiftAuxInt(c,24,31,32)] x)
(SLWconst [c] z:(MOVHZreg x)) && z.Uses == 1 && c < 16 => (CLRLSLWI [newPPC64ShiftAuxInt(c,16,31,32)] x)
-(SLWconst [c] z:(MOVWZreg x)) && z.Uses == 1 && c < 24 => (CLRLSLWI [newPPC64ShiftAuxInt(c,8,31,32)] x)
-(SLWconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
-(SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
+(SLWconst [c] z:(ANDconst [d] x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
+(SLWconst [c] z:(AND (MOVDconst [d]) x)) && z.Uses == 1 && isPPC64ValidShiftMask(d) && c<=(32-getPPC64ShiftMaskLength(d)) => (CLRLSLWI [newPPC64ShiftAuxInt(c,32-getPPC64ShiftMaskLength(d),31,32)] x)
// special case for power9
(SL(W|D)const [c] z:(MOVWreg x)) && c < 32 && objabi.GOPPC64 >= 9 => (EXTSWSLconst [c] x)
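
[Note] The new rotate-and-mask combines (RLWINM/RLWNM) lean on the identity noted above -- a rotated word bitmask is still a word bitmask -- because AND commutes with rotation once the mask is rotated too. A sketch:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// rotl(x&m, r) == rotl(x, r) & rotl(m, r): masking commutes with
	// rotation once the mask is rotated too, so ROTLWconst of an AND
	// folds into one RLWINM with a rotated mask.
	x, m := uint32(0x12345678), uint32(0x00FF00FF)
	r := 8
	fmt.Println(bits.RotateLeft32(x&m, r) == bits.RotateLeft32(x, r)&bits.RotateLeft32(m, r)) // true
}
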
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
index 28317928a8..f7198b90c3 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -137,6 +137,7 @@ func init() {
gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
gp11 = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{gp}}
gp21 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
+ gp21a0 = regInfo{inputs: []regMask{gp, gp | sp | sb}, outputs: []regMask{gp}}
gp31 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp}}
gp22 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}}
gp32 = regInfo{inputs: []regMask{gp | sp | sb, gp | sp | sb, gp | sp | sb}, outputs: []regMask{gp, gp}}
@@ -181,6 +182,8 @@ func init() {
{name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit)
{name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit)
+ {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
+ {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
{name: "MADDLD", argLength: 3, reg: gp31, asm: "MADDLD", typ: "Int64"}, // (arg0*arg1)+arg2 (signed 64-bit)
{name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true}, // (arg0 * arg1) >> 64, signed
@@ -225,6 +228,10 @@ func init() {
{name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits
{name: "EXTSWSLconst", argLength: 1, reg: gp11, asm: "EXTSWSLI", aux: "Int64"},
+ {name: "RLWINM", argLength: 1, reg: gp11, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by immediate "rlwinm". encodePPC64RotateMask describes aux
+ {name: "RLWNM", argLength: 2, reg: gp21, asm: "RLWNM", aux: "Int64"}, // Rotate and mask by "rlwnm". encodePPC64RotateMask describes aux
+ {name: "RLWMI", argLength: 2, reg: gp21a0, asm: "RLWMI", aux: "Int64", resultInArg0: true}, // "rlwimi" similar aux encoding as above
+
{name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros
{name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit)
@@ -600,25 +607,22 @@ func init() {
{name: "LoweredAtomicLoadPtr", argLength: 2, reg: gpload, typ: "Int64", aux: "Int64", clobberFlags: true, faultOnNilArg0: true},
// atomic add32, 64
- // SYNC
+ // LWSYNC
// LDAR (Rarg0), Rout
// ADD Rarg1, Rout
// STDCCC Rout, (Rarg0)
// BNE -3(PC)
- // ISYNC
// return new sum
-
{name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
// atomic exchange32, 64
- // SYNC
+ // LWSYNC
// LDAR (Rarg0), Rout
// STDCCC Rarg1, (Rarg0)
// BNE -2(PC)
// ISYNC
// return old val
-
{name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
@@ -641,15 +645,16 @@ func init() {
{name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, aux: "Int64", clobberFlags: true, faultOnNilArg0: true, hasSideEffects: true},
- // atomic 8 and/or.
+ // atomic 8/32 and/or.
// *arg0 &= (|=) arg1. arg2=mem. returns memory. auxint must be zero.
- // LBAR (Rarg0), Rtmp
+ // LBAR/LWAT (Rarg0), Rtmp
// AND/OR Rarg1, Rtmp
- // STBCCC Rtmp, (Rarg0), Rtmp
+ // STBCCC/STWCCC Rtmp, (Rarg0), Rtmp
// BNE Rtmp, -3(PC)
-
{name: "LoweredAtomicAnd8", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicAnd32", argLength: 3, reg: gpstore, asm: "AND", faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicOr8", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true},
+ {name: "LoweredAtomicOr32", argLength: 3, reg: gpstore, asm: "OR", faultOnNilArg0: true, hasSideEffects: true},
// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
// It preserves R0 through R17 (except special registers R1, R2, R11, R12, R13), g, and its arguments R20 and R21,
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64.rules b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
index 9437c8e9d4..325cbeb825 100644
--- a/src/cmd/compile/internal/ssa/gen/RISCV64.rules
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64.rules
@@ -3,11 +3,7 @@
// license that can be found in the LICENSE file.
// Optimizations TODO:
-// * Somehow track when values are already zero/signed-extended, avoid re-extending.
// * Use SLTI and SLTIU for comparisons to constants, instead of SLT/SLTU with constants in registers
-// * Find a more efficient way to do zero/sign extension than left+right shift.
-// There are many other options (store then load-extend, LUI+ANDI for zero extend, special case 32->64, ...),
-// but left+right shift is simple and uniform, and we don't have real hardware to do perf testing on anyway.
// * Use the zero register instead of moving 0 into a register.
// * Add rules to avoid generating a temp bool value for (If (SLT[U] ...) ...).
// * Optimize left and right shift by simplifying SLTIU, Neg, and ADD for constants.
@@ -66,8 +62,8 @@
(Mod32u ...) => (REMUW ...)
(Mod16 x y [false]) => (REMW (SignExt16to32 x) (SignExt16to32 y))
(Mod16u x y) => (REMUW (ZeroExt16to32 x) (ZeroExt16to32 y))
-(Mod8 x y) => (REMW (SignExt8to32 x) (SignExt8to32 y))
-(Mod8u x y) => (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y))
+(Mod8 x y) => (REMW (SignExt8to32 x) (SignExt8to32 y))
+(Mod8u x y) => (REMUW (ZeroExt8to32 x) (ZeroExt8to32 y))
(And64 ...) => (AND ...)
(And32 ...) => (AND ...)
@@ -98,25 +94,21 @@
(Sqrt ...) => (FSQRTD ...)
-// Zero and sign extension
-// Shift left until the bits we want are at the top of the register.
-// Then logical/arithmetic shift right for zero/sign extend.
-// We always extend to 64 bits; there's no reason not to,
-// and optimization rules can then collapse some extensions.
-
-(SignExt8to16 <t> x) => (SRAI [56] (SLLI <t> [56] x))
-(SignExt8to32 <t> x) => (SRAI [56] (SLLI <t> [56] x))
-(SignExt8to64 <t> x) => (SRAI [56] (SLLI <t> [56] x))
-(SignExt16to32 <t> x) => (SRAI [48] (SLLI <t> [48] x))
-(SignExt16to64 <t> x) => (SRAI [48] (SLLI <t> [48] x))
-(SignExt32to64 <t> x) => (ADDIW [0] x)
-
-(ZeroExt8to16 <t> x) => (SRLI [56] (SLLI <t> [56] x))
-(ZeroExt8to32 <t> x) => (SRLI [56] (SLLI <t> [56] x))
-(ZeroExt8to64 <t> x) => (SRLI [56] (SLLI <t> [56] x))
-(ZeroExt16to32 <t> x) => (SRLI [48] (SLLI <t> [48] x))
-(ZeroExt16to64 <t> x) => (SRLI [48] (SLLI <t> [48] x))
-(ZeroExt32to64 <t> x) => (SRLI [32] (SLLI <t> [32] x))
+// Sign and zero extension.
+
+(SignExt8to16 ...) => (MOVBreg ...)
+(SignExt8to32 ...) => (MOVBreg ...)
+(SignExt8to64 ...) => (MOVBreg ...)
+(SignExt16to32 ...) => (MOVHreg ...)
+(SignExt16to64 ...) => (MOVHreg ...)
+(SignExt32to64 ...) => (MOVWreg ...)
+
+(ZeroExt8to16 ...) => (MOVBUreg ...)
+(ZeroExt8to32 ...) => (MOVBUreg ...)
+(ZeroExt8to64 ...) => (MOVBUreg ...)
+(ZeroExt16to32 ...) => (MOVHUreg ...)
+(ZeroExt16to64 ...) => (MOVHUreg ...)
+(ZeroExt32to64 ...) => (MOVWUreg ...)
(Cvt32to32F ...) => (FCVTSW ...)
(Cvt32to64F ...) => (FCVTDW ...)
@@ -261,16 +253,16 @@
(EqPtr x y) => (SEQZ (SUB <x.Type> x y))
(Eq64 x y) => (SEQZ (SUB <x.Type> x y))
(Eq32 x y) => (SEQZ (SUBW <x.Type> x y))
-(Eq16 x y) => (SEQZ (ZeroExt16to64 (SUB <x.Type> x y)))
-(Eq8 x y) => (SEQZ (ZeroExt8to64 (SUB <x.Type> x y)))
+(Eq16 x y) => (SEQZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Eq8 x y) => (SEQZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Eq64F ...) => (FEQD ...)
(Eq32F ...) => (FEQS ...)
(NeqPtr x y) => (SNEZ (SUB <x.Type> x y))
(Neq64 x y) => (SNEZ (SUB <x.Type> x y))
(Neq32 x y) => (SNEZ (SUBW <x.Type> x y))
-(Neq16 x y) => (SNEZ (ZeroExt16to64 (SUB <x.Type> x y)))
-(Neq8 x y) => (SNEZ (ZeroExt8to64 (SUB <x.Type> x y)))
+(Neq16 x y) => (SNEZ (SUB <x.Type> (ZeroExt16to64 x) (ZeroExt16to64 y)))
+(Neq8 x y) => (SNEZ (SUB <x.Type> (ZeroExt8to64 x) (ZeroExt8to64 y)))
(Neq64F ...) => (FNED ...)
(Neq32F ...) => (FNES ...)
@@ -291,8 +283,8 @@
(Store {t} ptr val mem) && t.Size() == 2 => (MOVHstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 4 && !is32BitFloat(val.Type) => (MOVWstore ptr val mem)
(Store {t} ptr val mem) && t.Size() == 8 && !is64BitFloat(val.Type) => (MOVDstore ptr val mem)
-(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVWstore ptr val mem)
-(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 4 && is32BitFloat(val.Type) => (FMOVWstore ptr val mem)
+(Store {t} ptr val mem) && t.Size() == 8 && is64BitFloat(val.Type) => (FMOVDstore ptr val mem)
// We need to fold MOVaddr into the LD/MOVDstore ops so that the live variable analysis
// knows what variables are being read/written by the ops.
@@ -368,6 +360,13 @@
(Zero [4] ptr mem) => (MOVWstore ptr (MOVWconst [0]) mem)
(Zero [8] ptr mem) => (MOVDstore ptr (MOVDconst [0]) mem)
+// Medium zeroing uses a Duff's device
+// 8 and 128 are magic constants; see runtime/mkduff.go.
+(Zero [s] {t} ptr mem)
+ && s%8 == 0 && s >= 16 && s <= 8*128
+ && t.Alignment()%8 == 0 && !config.noDuffDevice =>
+ (DUFFZERO [8 * (128 - s/8)] ptr mem)
+
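
The auxint is the offset into the duffzero routine at which execution starts. Assuming each 8-byte element of the routine occupies 8 bytes of code (two 4-byte instructions, a store and a pointer increment, per runtime/mkduff.go), the arithmetic is:

    // duffZeroOffset computes the DUFFZERO auxint for zeroing s bytes
    // (s a multiple of 8, 16 <= s <= 8*128). Illustrative helper, not in the source.
    func duffZeroOffset(s int64) int64 {
        return 8 * (128 - s/8) // skip the elements that are not needed
    }

For example, zeroing 64 bytes (8 elements) enters at 8*(128-8) = 960, leaving exactly 8 elements to run.
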
// Generic zeroing uses a loop
(Zero [s] {t} ptr mem) =>
(LoweredZero [t.Alignment()]
@@ -403,6 +402,13 @@
(Move [4] dst src mem) => (MOVWstore dst (MOVWload src mem) mem)
(Move [8] dst src mem) => (MOVDstore dst (MOVDload src mem) mem)
+// Medium move uses a Duff's device
+// 16 and 128 are magic constants; see runtime/mkduff.go.
+(Move [s] {t} dst src mem)
+ && s%8 == 0 && s >= 16 && s <= 8*128 && t.Alignment()%8 == 0
+ && !config.noDuffDevice && logLargeCopy(v, s) =>
+ (DUFFCOPY [16 * (128 - s/8)] dst src mem)
+
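
The copy case is analogous, except that each 8-byte element of duffcopy presumably costs 16 bytes of code (load, store, and two pointer increments), hence the factor of 16:

    // duffCopyOffset computes the DUFFCOPY auxint for moving s bytes.
    // Illustrative helper, mirroring the rule above.
    func duffCopyOffset(s int64) int64 {
        return 16 * (128 - s/8)
    }
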
// Generic move uses a loop
(Move [s] {t} dst src mem) && (s <= 16 || logLargeCopy(v, s)) =>
(LoweredMove [t.Alignment()]
@@ -501,6 +507,65 @@
(MOVWstore [off] {sym} ptr (MOVWconst [0]) mem) => (MOVWstorezero [off] {sym} ptr mem)
(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) => (MOVDstorezero [off] {sym} ptr mem)
+// Avoid sign/zero extension after properly typed load.
+(MOVBreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWreg x:(MOVWload _ _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUload _ _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUload _ _)) => (MOVDreg x)
+
+// Fold double extensions.
+(MOVBreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVHreg _)) => (MOVDreg x)
+(MOVWreg x:(MOVWreg _)) => (MOVDreg x)
+(MOVBUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVHUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVBUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVHUreg _)) => (MOVDreg x)
+(MOVWUreg x:(MOVWUreg _)) => (MOVDreg x)
+
+// Do not extend before store.
+(MOVBstore [off] {sym} ptr (MOVBreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVBUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVBstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVBstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVHUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVHstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVHstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+(MOVWstore [off] {sym} ptr (MOVWUreg x) mem) => (MOVWstore [off] {sym} ptr x mem)
+
+// Replace extend after load with alternate load where possible.
+(MOVBreg <t> x:(MOVBUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBload <t> [off] {sym} ptr mem)
+(MOVHreg <t> x:(MOVHUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHload <t> [off] {sym} ptr mem)
+(MOVWreg <t> x:(MOVWUload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWload <t> [off] {sym} ptr mem)
+(MOVBUreg <t> x:(MOVBload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVBUload <t> [off] {sym} ptr mem)
+(MOVHUreg <t> x:(MOVHload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVHUload <t> [off] {sym} ptr mem)
+(MOVWUreg <t> x:(MOVWload [off] {sym} ptr mem)) && x.Uses == 1 && clobber(x) => @x.Block (MOVWUload <t> [off] {sym} ptr mem)
+
+// If a register move has only 1 use, just use the same register without emitting an instruction.
+// MOVDnop does not emit an instruction; it exists only to ensure the result type.
+(MOVDreg x) && x.Uses == 1 => (MOVDnop x)
+
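
Taken together, these rules let a correctly typed load absorb its extension entirely. A small Go example where the chain is expected to collapse (the ops in the comments are inferred from the rules above, not compiler output):

    func loadByte(p *uint8) uint64 {
        b := *p          // MOVBUload: zero-extending byte load
        return uint64(b) // MOVBUreg -> MOVDreg -> MOVDnop: no instruction emitted
    }
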
// Fold constant into immediate instructions where possible.
(ADD (MOVBconst [val]) x) => (ADDI [int64(val)] x)
(ADD (MOVHconst [val]) x) => (ADDI [int64(val)] x)
diff --git a/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
index b06b86075e..f64319230b 100644
--- a/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/RISCV64Ops.go
@@ -24,10 +24,11 @@ import (
// L = 64 bit int, used when the opcode starts with F
const (
- riscv64REG_G = 4
+ riscv64REG_G = 27
riscv64REG_CTXT = 20
riscv64REG_LR = 1
riscv64REG_SP = 2
+ riscv64REG_TP = 4
riscv64REG_TMP = 31
riscv64REG_ZERO = 0
)
@@ -78,8 +79,8 @@ func init() {
// Add general purpose registers to gpMask.
switch r {
- // ZERO, and TMP are not in any gp mask.
- case riscv64REG_ZERO, riscv64REG_TMP:
+ // ZERO, TP and TMP are not in any gp mask.
+ case riscv64REG_ZERO, riscv64REG_TP, riscv64REG_TMP:
case riscv64REG_G:
gpgMask |= mask
gpspsbgMask |= mask
@@ -192,6 +193,17 @@ func init() {
{name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 32 bits
{name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // 64 bits
+ // Conversions
+ {name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
+ {name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
+ {name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
+ {name: "MOVDreg", argLength: 1, reg: gp11, asm: "MOV"}, // move from arg0
+ {name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, zero-extended from byte
+ {name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, zero-extended from half
+ {name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, zero-extended from word
+
+ {name: "MOVDnop", argLength: 1, reg: regInfo{inputs: []regMask{gpMask}, outputs: []regMask{gpMask}}, resultInArg0: true}, // nop, return arg0 in same register
+
// Shift ops
{name: "SLL", argLength: 2, reg: gp21, asm: "SLL"}, // arg0 << (arg1 & 63)
{name: "SRA", argLength: 2, reg: gp21, asm: "SRA"}, // arg0 >> (arg1 & 63), signed
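
As the comments note, RISC-V register shifts use only the low six bits of the count. In Go terms (illustrative only):

    func sll(x, y uint64) uint64      { return x << (y & 63) } // SLL
    func sra(x int64, y uint64) int64 { return x >> (y & 63) } // SRA, arithmetic
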
@@ -228,6 +240,44 @@ func init() {
{name: "CALLclosure", argLength: 3, reg: callClosure, aux: "CallOff", call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
{name: "CALLinter", argLength: 2, reg: callInter, aux: "CallOff", call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
+ // duffzero
+ // arg0 = address of memory to zero (in X10, changed as side effect)
+ // arg1 = mem
+ // auxint = offset into duffzero code to start executing
+ // X1 (link register) changed because of function call
+ // returns mem
+ {
+ name: "DUFFZERO",
+ aux: "Int64",
+ argLength: 2,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X10"]},
+ clobbers: regNamed["X1"] | regNamed["X10"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ },
+
+ // duffcopy
+ // arg0 = address of dst memory (in X11, changed as side effect)
+ // arg1 = address of src memory (in X10, changed as side effect)
+ // arg2 = mem
+ // auxint = offset into duffcopy code to start executing
+ // X1 (link register) changed because of function call
+ // returns mem
+ {
+ name: "DUFFCOPY",
+ aux: "Int64",
+ argLength: 3,
+ reg: regInfo{
+ inputs: []regMask{regNamed["X11"], regNamed["X10"]},
+ clobbers: regNamed["X1"] | regNamed["X10"] | regNamed["X11"],
+ },
+ typ: "Mem",
+ faultOnNilArg0: true,
+ faultOnNilArg1: true,
+ },
+
// Generic moves and zeros
// general unaligned zeroing
diff --git a/src/cmd/compile/internal/ssa/gen/S390X.rules b/src/cmd/compile/internal/ssa/gen/S390X.rules
index e564f638d3..2d6f091a4e 100644
--- a/src/cmd/compile/internal/ssa/gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/gen/S390X.rules
@@ -198,6 +198,9 @@
(RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
mem)
+(AtomicAnd32 ...) => (LAN ...)
+(AtomicOr32 ...) => (LAO ...)
+
// Lowering extension
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to(16|32|64) ...) => (MOVBreg ...)
diff --git a/src/cmd/compile/internal/ssa/gen/S390XOps.go b/src/cmd/compile/internal/ssa/gen/S390XOps.go
index 417b33cf91..728cfb5508 100644
--- a/src/cmd/compile/internal/ssa/gen/S390XOps.go
+++ b/src/cmd/compile/internal/ssa/gen/S390XOps.go
@@ -547,8 +547,10 @@ func init() {
// Atomic bitwise operations.
// Note: 'floor' operations round the pointer down to the nearest word boundary
// which reflects how they are used in the runtime.
- {name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem.
+ {name: "LAN", argLength: 3, reg: gpstore, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 &= arg1. arg2 = mem.
{name: "LANfloor", argLength: 3, reg: gpstorelab, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) &= arg1. arg2 = mem.
+ {name: "LAO", argLength: 3, reg: gpstore, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *arg0 |= arg1. arg2 = mem.
+ {name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem.
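
LAN and LAO perform the read-modify-write as a single interlocked instruction. For reference, a sketch of the semantics they implement, written as the compare-and-swap loop a target without such instructions might use (sync/atomic is used purely for illustration):

    package sketch

    import "sync/atomic"

    // atomicAnd32 implements *p &= v atomically; on s390x the compiler
    // now emits a single LAN instead of a loop like this.
    func atomicAnd32(p *uint32, v uint32) {
        for {
            old := atomic.LoadUint32(p)
            if atomic.CompareAndSwapUint32(p, old, old&v) {
                return
            }
        }
    }
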
// Compare and swap.
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
diff --git a/src/cmd/compile/internal/ssa/gen/dec64.rules b/src/cmd/compile/internal/ssa/gen/dec64.rules
index 4f9e863f90..07607960fa 100644
--- a/src/cmd/compile/internal/ssa/gen/dec64.rules
+++ b/src/cmd/compile/internal/ssa/gen/dec64.rules
@@ -9,7 +9,6 @@
(Int64Hi (Int64Make hi _)) => hi
(Int64Lo (Int64Make _ lo)) => lo
-
(Load <t> ptr mem) && is64BitInt(t) && !config.BigEndian && t.IsSigned() =>
(Int64Make
(Load <typ.Int32> (OffPtr <typ.Int32Ptr> [4] ptr) mem)
@@ -143,6 +142,10 @@
(Trunc64to32 (Int64Make _ lo)) => lo
(Trunc64to16 (Int64Make _ lo)) => (Trunc32to16 lo)
(Trunc64to8 (Int64Make _ lo)) => (Trunc32to8 lo)
+// Most general
+(Trunc64to32 x) => (Int64Lo x)
+(Trunc64to16 x) => (Trunc32to16 (Int64Lo x))
+(Trunc64to8 x) => (Trunc32to8 (Int64Lo x))
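
On a 32-bit target a 64-bit value is a (hi, lo) pair of words, so truncation is free: keep the low word and, if needed, narrow it. In Go terms (illustrative):

    func trunc64to32(hi, lo uint32) uint32 { return lo }         // (Int64Lo x)
    func trunc64to16(hi, lo uint32) uint16 { return uint16(lo) } // (Trunc32to16 (Int64Lo x))
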
(Lsh32x64 _ (Int64Make (Const32 [c]) _)) && c != 0 => (Const32 [0])
(Rsh32x64 x (Int64Make (Const32 [c]) _)) && c != 0 => (Signmask x)
@@ -175,156 +178,174 @@
// turn x64 non-constant shifts to x32 shifts
// if high 32-bit of the shift is nonzero, make a huge shift
(Lsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh64x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh64Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Lsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh32x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh32Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Lsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh16x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh16Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Lsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh8x64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
(Rsh8Ux64 x (Int64Make hi lo)) && hi.Op != OpConst32 =>
- (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+ (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask hi) lo))
+
+// Most general
+(Lsh64x64 x y) => (Lsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh64x64 x y) => (Rsh64x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh64Ux64 x y) => (Rsh64Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh32x64 x y) => (Lsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh32x64 x y) => (Rsh32x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh32Ux64 x y) => (Rsh32Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh16x64 x y) => (Lsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh16x64 x y) => (Rsh16x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh16Ux64 x y) => (Rsh16Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Lsh8x64 x y) => (Lsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh8x64 x y) => (Rsh8x32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+(Rsh8Ux64 x y) => (Rsh8Ux32 x (Or32 <typ.UInt32> (Zeromask (Int64Hi y)) (Int64Lo y)))
+
+// Clean up constants a little
+(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c == 0 => y
+(Or32 <typ.UInt32> (Zeromask (Const32 [c])) y) && c != 0 => (Const32 <typ.UInt32> [-1])
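
The count reduction works because a 64-bit shift count with a nonzero high word is at least 2^32, far past any useful amount. Zeromask turns the high word into 0 or all ones, and ORing that into the low word yields an oversized 32-bit count exactly when the 64-bit count was oversized. A sketch:

    // shiftCount reduces a 64-bit shift count (hi, lo) to 32 bits while
    // preserving whether the count exceeds the operand width. Illustrative.
    func shiftCount(hi, lo uint32) uint32 {
        var mask uint32
        if hi != 0 {
            mask = 0xFFFFFFFF // Zeromask(hi)
        }
        return mask | lo
    }
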
// 64x left shift
// result.hi = hi<<s | lo>>(32-s) | lo<<(s-32) // >> is unsigned, large shifts result in 0
// result.lo = lo<<s
-(Lsh64x32 (Int64Make hi lo) s) =>
+(Lsh64x32 x s) =>
(Int64Make
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Lsh32x32 <typ.UInt32> hi s)
+ (Lsh32x32 <typ.UInt32> (Int64Hi x) s)
(Rsh32Ux32 <typ.UInt32>
- lo
+ (Int64Lo x)
(Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
(Lsh32x32 <typ.UInt32>
- lo
+ (Int64Lo x)
(Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32]))))
- (Lsh32x32 <typ.UInt32> lo s))
-(Lsh64x16 (Int64Make hi lo) s) =>
+ (Lsh32x32 <typ.UInt32> (Int64Lo x) s))
+(Lsh64x16 x s) =>
(Int64Make
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Lsh32x16 <typ.UInt32> hi s)
+ (Lsh32x16 <typ.UInt32> (Int64Hi x) s)
(Rsh32Ux16 <typ.UInt32>
- lo
+ (Int64Lo x)
(Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
(Lsh32x16 <typ.UInt32>
- lo
+ (Int64Lo x)
(Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32]))))
- (Lsh32x16 <typ.UInt32> lo s))
-(Lsh64x8 (Int64Make hi lo) s) =>
+ (Lsh32x16 <typ.UInt32> (Int64Lo x) s))
+(Lsh64x8 x s) =>
(Int64Make
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Lsh32x8 <typ.UInt32> hi s)
+ (Lsh32x8 <typ.UInt32> (Int64Hi x) s)
(Rsh32Ux8 <typ.UInt32>
- lo
+ (Int64Lo x)
(Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
(Lsh32x8 <typ.UInt32>
- lo
+ (Int64Lo x)
(Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32]))))
- (Lsh32x8 <typ.UInt32> lo s))
+ (Lsh32x8 <typ.UInt32> (Int64Lo x) s))
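
Written out in Go, the left-shift expansion computes the following (a sketch; Go shifts of a uint32 by 32 or more yield 0, matching the rule comment, and the wraparound of 32-s and s-32 selects which term contributes):

    func lsh64(hi, lo, s uint32) (rhi, rlo uint32) {
        rhi = hi<<s | lo>>(32-s) | lo<<(s-32)
        rlo = lo << s
        return
    }

For s in [1,31] the middle term carries lo's high bits into the high word; for s >= 32 the high word is just lo shifted left by s-32.
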
// 64x unsigned right shift
// result.hi = hi>>s
// result.lo = lo>>s | hi<<(32-s) | hi>>(s-32) // >> is unsigned, large shifts result in 0
-(Rsh64Ux32 (Int64Make hi lo) s) =>
+(Rsh64Ux32 x s) =>
(Int64Make
- (Rsh32Ux32 <typ.UInt32> hi s)
+ (Rsh32Ux32 <typ.UInt32> (Int64Hi x) s)
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Rsh32Ux32 <typ.UInt32> lo s)
+ (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s)
(Lsh32x32 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
(Rsh32Ux32 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32])))))
-(Rsh64Ux16 (Int64Make hi lo) s) =>
+(Rsh64Ux16 x s) =>
(Int64Make
- (Rsh32Ux16 <typ.UInt32> hi s)
+ (Rsh32Ux16 <typ.UInt32> (Int64Hi x) s)
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Rsh32Ux16 <typ.UInt32> lo s)
+ (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s)
(Lsh32x16 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
(Rsh32Ux16 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32])))))
-(Rsh64Ux8 (Int64Make hi lo) s) =>
+(Rsh64Ux8 x s) =>
(Int64Make
- (Rsh32Ux8 <typ.UInt32> hi s)
+ (Rsh32Ux8 <typ.UInt32> (Int64Hi x) s)
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Rsh32Ux8 <typ.UInt32> lo s)
+ (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s)
(Lsh32x8 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
(Rsh32Ux8 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32])))))
// 64x signed right shift
// result.hi = hi>>s
// result.lo = lo>>s | hi<<(32-s) | (hi>>(s-32))&zeromask(s>>5) // hi>>(s-32) is signed, large shifts result in 0/-1
-(Rsh64x32 (Int64Make hi lo) s) =>
+(Rsh64x32 x s) =>
(Int64Make
- (Rsh32x32 <typ.UInt32> hi s)
+ (Rsh32x32 <typ.UInt32> (Int64Hi x) s)
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Rsh32Ux32 <typ.UInt32> lo s)
+ (Rsh32Ux32 <typ.UInt32> (Int64Lo x) s)
(Lsh32x32 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub32 <typ.UInt32> (Const32 <typ.UInt32> [32]) s)))
(And32 <typ.UInt32>
(Rsh32x32 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub32 <typ.UInt32> s (Const32 <typ.UInt32> [32])))
(Zeromask
(Rsh32Ux32 <typ.UInt32> s (Const32 <typ.UInt32> [5]))))))
-(Rsh64x16 (Int64Make hi lo) s) =>
+(Rsh64x16 x s) =>
(Int64Make
- (Rsh32x16 <typ.UInt32> hi s)
+ (Rsh32x16 <typ.UInt32> (Int64Hi x) s)
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Rsh32Ux16 <typ.UInt32> lo s)
+ (Rsh32Ux16 <typ.UInt32> (Int64Lo x) s)
(Lsh32x16 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub16 <typ.UInt16> (Const16 <typ.UInt16> [32]) s)))
(And32 <typ.UInt32>
(Rsh32x16 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub16 <typ.UInt16> s (Const16 <typ.UInt16> [32])))
(Zeromask
(ZeroExt16to32
(Rsh16Ux32 <typ.UInt16> s (Const32 <typ.UInt32> [5])))))))
-(Rsh64x8 (Int64Make hi lo) s) =>
+(Rsh64x8 x s) =>
(Int64Make
- (Rsh32x8 <typ.UInt32> hi s)
+ (Rsh32x8 <typ.UInt32> (Int64Hi x) s)
(Or32 <typ.UInt32>
(Or32 <typ.UInt32>
- (Rsh32Ux8 <typ.UInt32> lo s)
+ (Rsh32Ux8 <typ.UInt32> (Int64Lo x) s)
(Lsh32x8 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub8 <typ.UInt8> (Const8 <typ.UInt8> [32]) s)))
(And32 <typ.UInt32>
(Rsh32x8 <typ.UInt32>
- hi
+ (Int64Hi x)
(Sub8 <typ.UInt8> s (Const8 <typ.UInt8> [32])))
(Zeromask
(ZeroExt8to32
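
The signed variant adds one wrinkle: the hi>>(s-32) term is an arithmetic shift, so for small counts it can evaluate to -1 rather than 0 and must be masked out explicitly; Zeromask(s>>5) is all ones exactly when s >= 32. In Go (a sketch; signed shifts by oversized counts sign-fill):

    func rsh64x(hi int32, lo, s uint32) (rhi int32, rlo uint32) {
        rhi = hi >> s // arithmetic: oversized counts give 0 or -1
        var zmask uint32
        if s>>5 != 0 { // s >= 32, i.e. Zeromask(s >> 5) is all ones
            zmask = 0xFFFFFFFF
        }
        rlo = lo>>s | uint32(hi)<<(32-s) | uint32(hi>>(s-32))&zmask
        return
    }
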
diff --git a/src/cmd/compile/internal/ssa/gen/generic.rules b/src/cmd/compile/internal/ssa/gen/generic.rules
index 39f8cc8889..4351ef5bdd 100644
--- a/src/cmd/compile/internal/ssa/gen/generic.rules
+++ b/src/cmd/compile/internal/ssa/gen/generic.rules
@@ -1961,6 +1961,31 @@
&& warnRule(fe.Debug_checknil(), v, "removed nil check")
=> (Invalid)
+// for late-expanded calls
+(Zero (SelectN [0] call:(StaticLECall _ _)) mem:(SelectN [1] call))
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(Store (SelectN [0] call:(StaticLECall _ _)) x mem:(SelectN [1] call))
+ && isConstZero(x)
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(Store (OffPtr (SelectN [0] call:(StaticLECall _ _))) x mem:(SelectN [1] call))
+ && isConstZero(x)
+ && isSameCall(call.Aux, "runtime.newobject")
+ => mem
+
+(NilCheck (SelectN [0] call:(StaticLECall _ _)) (SelectN [1] call))
+ && isSameCall(call.Aux, "runtime.newobject")
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+
+(NilCheck (OffPtr (SelectN [0] call:(StaticLECall _ _))) (SelectN [1] call))
+ && isSameCall(call.Aux, "runtime.newobject")
+ && warnRule(fe.Debug_checknil(), v, "removed nil check")
+ => (Invalid)
+
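
These mirror the runtime.newobject rules just above, restated for the late-expanded call form. A Go function they are expected to clean up (a sketch; the annotations are inferred from the rules, not compiler output):

    type T struct{ a, b int64 }

    func g() *T {
        p := new(T) // SelectN [0] of a StaticLECall to runtime.newobject
        p.a = 0     // store of a constant zero into fresh zeroed memory: folded to mem
        return p    // nil checks against p: rewritten to Invalid
    }
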
// Evaluate constant address comparisons.
(EqPtr x x) => (ConstBool [true])
(NeqPtr x x) => (ConstBool [false])
@@ -2017,6 +2042,17 @@
&& clobber(s1, s2, s3)
=> (Move {t.Elem()} [int64(sz)] dst src mem)
+// Inline small or disjoint runtime.memmove calls with constant length.
+// See the comment in op Move in genericOps.go for discussion of the type.
+(SelectN [0] call:(StaticLECall {sym} dst src (Const(64|32) [sz]) mem))
+ && sz >= 0
+ && call.Uses == 1 // this will exclude all calls with results
+ && isSameCall(sym, "runtime.memmove")
+ && dst.Type.IsPtr() // avoids TUINTPTR, see issue 30061
+ && isInlinableMemmove(dst, src, int64(sz), config)
+ && clobber(call)
+ => (Move {dst.Type.Elem()} [int64(sz)] dst src mem)
+
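
A source pattern this rule is intended to catch, assuming the constant length is visible to the compiler (illustrative):

    func copy8(dst, src []byte) {
        copy(dst[:8], src[:8]) // runtime.memmove with sz = 8; the int result is unused,
        //                        so call.Uses == 1 and the call becomes (Move [8] dst src mem)
    }
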
// De-virtualize interface calls into static calls.
// Note that (ITab (IMake)) doesn't get
// rewritten until after the first opt pass,
@@ -2024,6 +2060,13 @@
(InterCall [argsize] {auxCall} (Load (OffPtr [off] (ITab (IMake (Addr {itab} (SB)) _))) _) mem) && devirt(v, auxCall, itab, off) != nil =>
(StaticCall [int32(argsize)] {devirt(v, auxCall, itab, off)} mem)
+// De-virtualize late-expanded interface calls into late-expanded static calls.
+// Note that (ITab (IMake)) doesn't get rewritten until after the first opt pass,
+// so this rule should trigger reliably.
+// devirtLECall removes the first argument, adds the devirtualized symbol to the AuxCall, and changes the opcode
+(InterLECall [argsize] {auxCall} (Load (OffPtr [off] (ITab (IMake (Addr {itab} (SB)) _))) _) ___) && devirtLESym(v, auxCall, itab, off) !=
+ nil => devirtLECall(v, devirtLESym(v, auxCall, itab, off))
+
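
The matched shape is an interface method call whose itab comes straight from constructing the interface with a concrete type, as in this sketch:

    type I interface{ M() }
    type T struct{}

    func (T) M() {}

    func h() {
        var i I = T{} // IMake with a constant itab for (T, I)
        i.M()         // InterLECall through that itab -> static call of T.M
    }

Whether the rule fires still depends on the (ITab (IMake ...)) form surviving to this pass, per the note above.
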
// Move and Zero optimizations.
// Move source and destination may overlap.
@@ -2404,6 +2447,7 @@
(Store {t5} (OffPtr <tt5> [o5] dst) d4
(Zero {t1} [n] dst mem)))))
+// TODO this does not fire before call expansion; is that acceptable?
(StaticCall {sym} x) && needRaceCleanup(sym, v) => x
// Collapse moving A -> B -> C into just A -> C.
diff --git a/src/cmd/compile/internal/ssa/gen/genericOps.go b/src/cmd/compile/internal/ssa/gen/genericOps.go
index 95edff4c8c..23bd4af2cd 100644
--- a/src/cmd/compile/internal/ssa/gen/genericOps.go
+++ b/src/cmd/compile/internal/ssa/gen/genericOps.go
@@ -389,10 +389,12 @@ var genericOps = []opData{
// TODO(josharian): ClosureCall and InterCall should have Int32 aux
// to match StaticCall's 32 bit arg size limit.
// TODO(drchase,josharian): could the arg size limit be bundled into the rules for CallOff?
- {name: "ClosureCall", argLength: 3, aux: "CallOff", call: true}, // arg0=code pointer, arg1=context ptr, arg2=memory. auxint=arg size. Returns memory.
- {name: "StaticCall", argLength: 1, aux: "CallOff", call: true}, // call function aux.(*obj.LSym), arg0=memory. auxint=arg size. Returns memory.
- {name: "InterCall", argLength: 2, aux: "CallOff", call: true}, // interface call. arg0=code pointer, arg1=memory, auxint=arg size. Returns memory.
- {name: "StaticLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded static call function aux.(*ssa.AuxCall.Fn). arg0..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "ClosureCall", argLength: 3, aux: "CallOff", call: true}, // arg0=code pointer, arg1=context ptr, arg2=memory. auxint=arg size. Returns memory.
+ {name: "StaticCall", argLength: 1, aux: "CallOff", call: true}, // call function aux.(*obj.LSym), arg0=memory. auxint=arg size. Returns memory.
+ {name: "InterCall", argLength: 2, aux: "CallOff", call: true}, // interface call. arg0=code pointer, arg1=memory, auxint=arg size. Returns memory.
+ {name: "ClosureLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded closure call. arg0=code pointer, arg1=context ptr, arg2..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "StaticLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded static call function aux.(*ssa.AuxCall.Fn). arg0..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
+ {name: "InterLECall", argLength: -1, aux: "CallOff", call: true}, // late-expanded interface call. arg0=code pointer, arg1..argN-1 are inputs, argN is mem. auxint = arg size. Result is tuple of result(s), plus mem.
// Conversions: signed extensions, zero (unsigned) extensions, truncations
{name: "SignExt8to16", argLength: 1, typ: "Int16"},
@@ -539,20 +541,22 @@ var genericOps = []opData{
{name: "SelectN", argLength: 1, aux: "Int64"}, // arg0=tuple, auxint=field index. Returns the auxint'th member.
{name: "SelectNAddr", argLength: 1, aux: "Int64"}, // arg0=tuple, auxint=field index. Returns the address of auxint'th member. Used for un-SSA-able result types.
- // Atomic operations used for semantically inlining runtime/internal/atomic.
- // Atomic loads return a new memory so that the loads are properly ordered
- // with respect to other loads and stores.
- // TODO: use for sync/atomic at some point.
+ // Atomic operations used for semantically inlining sync/atomic and
+ // runtime/internal/atomic. Atomic loads return a new memory so that
+ // the loads are properly ordered with respect to other loads and
+ // stores.
{name: "AtomicLoad8", argLength: 2, typ: "(UInt8,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
{name: "AtomicLoad32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
{name: "AtomicLoad64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
{name: "AtomicLoadPtr", argLength: 2, typ: "(BytePtr,Mem)"}, // Load from arg0. arg1=memory. Returns loaded value and new memory.
{name: "AtomicLoadAcq32", argLength: 2, typ: "(UInt32,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory.
+ {name: "AtomicLoadAcq64", argLength: 2, typ: "(UInt64,Mem)"}, // Load from arg0. arg1=memory. Lock acquisition, returns loaded value and new memory.
{name: "AtomicStore8", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
{name: "AtomicStore32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
{name: "AtomicStore64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
{name: "AtomicStorePtrNoWB", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns memory.
{name: "AtomicStoreRel32", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory.
+ {name: "AtomicStoreRel64", argLength: 3, typ: "Mem", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Lock release, returns memory.
{name: "AtomicExchange32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
{name: "AtomicExchange64", argLength: 3, typ: "(UInt64,Mem)", hasSideEffects: true}, // Store arg1 to *arg0. arg2=memory. Returns old contents of *arg0 and new memory.
{name: "AtomicAdd32", argLength: 3, typ: "(UInt32,Mem)", hasSideEffects: true}, // Do *arg0 += arg1. arg2=memory. Returns sum and new memory.
@@ -561,7 +565,9 @@ var genericOps = []opData{
{name: "AtomicCompareAndSwap64", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Returns true if store happens and new memory.
{name: "AtomicCompareAndSwapRel32", argLength: 4, typ: "(Bool,Mem)", hasSideEffects: true}, // if *arg0==arg1, then set *arg0=arg2. Lock release, reports whether store happens and new memory.
{name: "AtomicAnd8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
+ {name: "AtomicAnd32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 &= arg1. arg2=memory. Returns memory.
{name: "AtomicOr8", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
+ {name: "AtomicOr32", argLength: 3, typ: "Mem", hasSideEffects: true}, // *arg0 |= arg1. arg2=memory. Returns memory.
// Atomic operation variants
// These variants have the same semantics as above atomic operations.
diff --git a/src/cmd/compile/internal/ssa/gen/rulegen.go b/src/cmd/compile/internal/ssa/gen/rulegen.go
index be51a7c5f8..120ccbbdb3 100644
--- a/src/cmd/compile/internal/ssa/gen/rulegen.go
+++ b/src/cmd/compile/internal/ssa/gen/rulegen.go
@@ -35,8 +35,7 @@ import (
)
// rule syntax:
-// sexpr [&& extra conditions] -> [@block] sexpr (untyped)
-// sexpr [&& extra conditions] => [@block] sexpr (typed)
+// sexpr [&& extra conditions] => [@block] sexpr
//
// sexpr are s-expressions (lisp-like parenthesized groupings)
// sexpr ::= [variable:](opcode sexpr*)
@@ -50,8 +49,12 @@ import (
// variable ::= some token
// opcode ::= one of the opcodes from the *Ops.go files
+// special rules: a trailing ellipsis "..." in the outermost sexpr must match on both sides of a rule.
+// a trailing triple underscore "___" in the outermost match sexpr indicates the presence of
+// extra ignored args that need not appear in the replacement.
+
// extra conditions is just a chunk of Go that evaluates to a boolean. It may use
-// variables declared in the matching sexpr. The variable "v" is predefined to be
+// variables declared in the matching sexpr. The variable "v" is predefined to be
// the value matched by the entire rule.
// If multiple rules match, the first one in file order is selected.
@@ -75,14 +78,8 @@ func normalizeSpaces(s string) string {
}
// parse returns the matching part of the rule, additional conditions, and the result.
-// parse also reports whether the generated code should use strongly typed aux and auxint fields.
-func (r Rule) parse() (match, cond, result string, typed bool) {
- arrow := "->"
- if strings.Contains(r.Rule, "=>") {
- arrow = "=>"
- typed = true
- }
- s := strings.Split(r.Rule, arrow)
+func (r Rule) parse() (match, cond, result string) {
+ s := strings.Split(r.Rule, "=>")
match = normalizeSpaces(s[0])
result = normalizeSpaces(s[1])
cond = ""
@@ -90,7 +87,7 @@ func (r Rule) parse() (match, cond, result string, typed bool) {
cond = normalizeSpaces(match[i+2:])
match = normalizeSpaces(match[:i])
}
- return match, cond, result, typed
+ return match, cond, result
}
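
For example, the RISCV64 rule above, (ADD (MOVBconst [val]) x) => (ADDI [int64(val)] x), splits as:

    match:  (ADD (MOVBconst [val]) x)
    cond:   (empty)
    result: (ADDI [int64(val)] x)

A rule containing "&&" would put everything after it into cond.
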
func genRules(arch arch) { genRulesSuffix(arch, "") }
@@ -116,7 +113,7 @@ func genRulesSuffix(arch arch, suff string) {
scanner := bufio.NewScanner(text)
rule := ""
var lineno int
- var ruleLineno int // line number of "->" or "=>"
+ var ruleLineno int // line number of "=>"
for scanner.Scan() {
lineno++
line := scanner.Text()
@@ -130,13 +127,13 @@ func genRulesSuffix(arch arch, suff string) {
if rule == "" {
continue
}
- if !strings.Contains(rule, "->") && !strings.Contains(rule, "=>") {
+ if !strings.Contains(rule, "=>") {
continue
}
if ruleLineno == 0 {
ruleLineno = lineno
}
- if strings.HasSuffix(rule, "->") || strings.HasSuffix(rule, "=>") {
+ if strings.HasSuffix(rule, "=>") {
continue // continue on the next line
}
if n := balance(rule); n > 0 {
@@ -153,7 +150,7 @@ func genRulesSuffix(arch arch, suff string) {
continue
}
// Do fancier value op matching.
- match, _, _, _ := r.parse()
+ match, _, _ := r.parse()
op, oparch, _, _, _, _ := parseValue(match, arch, loc)
opname := fmt.Sprintf("Op%s%s", oparch, op.name)
oprules[opname] = append(oprules[opname], r)
@@ -227,7 +224,7 @@ func genRulesSuffix(arch arch, suff string) {
log.Fatalf("unconditional rule %s is followed by other rules", rr.Match)
}
rr = &RuleRewrite{Loc: rule.Loc}
- rr.Match, rr.Cond, rr.Result, rr.Typed = rule.parse()
+ rr.Match, rr.Cond, rr.Result = rule.parse()
pos, _ := genMatch(rr, arch, rr.Match, fn.ArgLen >= 0)
if pos == "" {
pos = "v.Pos"
@@ -786,7 +783,6 @@ type (
Alloc int // for unique var names
Loc string // file name & line number of the original rule
CommuteDepth int // used to track depth of commute loops
- Typed bool // aux and auxint fields should be strongly typed
}
Declare struct {
Name string
@@ -840,7 +836,7 @@ func breakf(format string, a ...interface{}) *CondBreak {
func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite {
rr := &RuleRewrite{Loc: rule.Loc}
- rr.Match, rr.Cond, rr.Result, rr.Typed = rule.parse()
+ rr.Match, rr.Cond, rr.Result = rule.parse()
_, _, auxint, aux, s := extract(rr.Match) // remove parens, then split
// check match of control values
@@ -884,15 +880,6 @@ func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite {
if e.name == "" {
continue
}
- if !rr.Typed {
- if !token.IsIdentifier(e.name) || rr.declared(e.name) {
- // code or variable
- rr.add(breakf("b.%s != %s", e.field, e.name))
- } else {
- rr.add(declf(e.name, "b.%s", e.field))
- }
- continue
- }
if e.dclType == "" {
log.Fatalf("op %s has no declared type for %s", data.name, e.field)
@@ -961,20 +948,12 @@ func genBlockRewrite(rule Rule, arch arch, data blockData) *RuleRewrite {
}
if auxint != "" {
- if rr.Typed {
- // Make sure auxint value has the right type.
- rr.add(stmtf("b.AuxInt = %sToAuxInt(%s)", unTitle(outdata.auxIntType()), auxint))
- } else {
- rr.add(stmtf("b.AuxInt = %s", auxint))
- }
+ // Make sure auxint value has the right type.
+ rr.add(stmtf("b.AuxInt = %sToAuxInt(%s)", unTitle(outdata.auxIntType()), auxint))
}
if aux != "" {
- if rr.Typed {
- // Make sure aux value has the right type.
- rr.add(stmtf("b.Aux = %sToAux(%s)", unTitle(outdata.auxType()), aux))
- } else {
- rr.add(stmtf("b.Aux = %s", aux))
- }
+ // Make sure aux value has the right type.
+ rr.add(stmtf("b.Aux = %sToAux(%s)", unTitle(outdata.auxType()), aux))
}
succChanged := false
@@ -1019,6 +998,19 @@ func genMatch0(rr *RuleRewrite, arch arch, match, v string, cnt map[string]int,
pos = v + ".Pos"
}
+ // If the last argument is ___, it means "don't care about any further (trailing) arguments".
+ // The likely/intended use is for rewrites that are too tricky to express in the existing pattern language.
+ // Do a length check early, because long patterns fed short (ultimately non-matching) inputs
+ // would otherwise cause an indexing error during pattern matching.
+ if op.argLength == -1 {
+ l := len(args)
+ if l == 0 || args[l-1] != "___" {
+ rr.add(breakf("len(%s.Args) != %d", v, l))
+ } else if l > 1 && args[l-1] == "___" {
+ rr.add(breakf("len(%s.Args) < %d", v, l-1))
+ }
+ }
+
for _, e := range []struct {
name, field, dclType string
}{
@@ -1029,15 +1021,6 @@ func genMatch0(rr *RuleRewrite, arch arch, match, v string, cnt map[string]int,
if e.name == "" {
continue
}
- if !rr.Typed {
- if !token.IsIdentifier(e.name) || rr.declared(e.name) {
- // code or variable
- rr.add(breakf("%s.%s != %s", v, e.field, e.name))
- } else {
- rr.add(declf(e.name, "%s.%s", v, e.field))
- }
- continue
- }
if e.dclType == "" {
log.Fatalf("op %s has no declared type for %s", op.name, e.field)
@@ -1159,9 +1142,6 @@ func genMatch0(rr *RuleRewrite, arch arch, match, v string, cnt map[string]int,
}
}
- if op.argLength == -1 {
- rr.add(breakf("len(%s.Args) != %d", v, len(args)))
- }
return pos, checkOp
}
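
With the arity check hoisted ahead of argument matching, the guards emitted for a variadic (argLength == -1) op look like this (a sketch of the generated code, not verbatim rulegen output):

    // for a match like (StaticLECall f mem): exact arity required
    if len(v.Args) != 2 {
        break
    }
    // for a match like (InterLECall code ___): at least the named args
    if len(v.Args) < 1 {
        break
    }
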
@@ -1230,20 +1210,12 @@ func genResult0(rr *RuleRewrite, arch arch, result string, top, move bool, pos s
}
if auxint != "" {
- if rr.Typed {
- // Make sure auxint value has the right type.
- rr.add(stmtf("%s.AuxInt = %sToAuxInt(%s)", v, unTitle(op.auxIntType()), auxint))
- } else {
- rr.add(stmtf("%s.AuxInt = %s", v, auxint))
- }
+ // Make sure auxint value has the right type.
+ rr.add(stmtf("%s.AuxInt = %sToAuxInt(%s)", v, unTitle(op.auxIntType()), auxint))
}
if aux != "" {
- if rr.Typed {
- // Make sure aux value has the right type.
- rr.add(stmtf("%s.Aux = %sToAux(%s)", v, unTitle(op.auxType()), aux))
- } else {
- rr.add(stmtf("%s.Aux = %s", v, aux))
- }
+ // Make sure aux value has the right type.
+ rr.add(stmtf("%s.Aux = %sToAux(%s)", v, unTitle(op.auxType()), aux))
}
all := new(strings.Builder)
for i, arg := range args {
@@ -1524,7 +1496,7 @@ func excludeFromExpansion(s string, idx []int) bool {
return true
}
right := s[idx[1]:]
- if strings.Contains(left, "&&") && (strings.Contains(right, "->") || strings.Contains(right, "=>")) {
+ if strings.Contains(left, "&&") && strings.Contains(right, "=>") {
// Inside && conditions.
return true
}
@@ -1626,7 +1598,6 @@ func normalizeWhitespace(x string) string {
x = strings.Replace(x, " )", ")", -1)
x = strings.Replace(x, "[ ", "[", -1)
x = strings.Replace(x, " ]", "]", -1)
- x = strings.Replace(x, ")->", ") ->", -1)
x = strings.Replace(x, ")=>", ") =>", -1)
return x
}
@@ -1683,7 +1654,7 @@ func parseEllipsisRules(rules []Rule, arch arch) (newop string, ok bool) {
return "", false
}
rule := rules[0]
- match, cond, result, _ := rule.parse()
+ match, cond, result := rule.parse()
if cond != "" || !isEllipsisValue(match) || !isEllipsisValue(result) {
if strings.Contains(rule.Rule, "...") {
log.Fatalf("%s: found ellipsis in non-ellipsis rule", rule.Loc)
@@ -1708,7 +1679,7 @@ func isEllipsisValue(s string) bool {
}
func checkEllipsisRuleCandidate(rule Rule, arch arch) {
- match, cond, result, _ := rule.parse()
+ match, cond, result := rule.parse()
if cond != "" {
return
}
@@ -1718,7 +1689,7 @@ func checkEllipsisRuleCandidate(rule Rule, arch arch) {
var usingCopy string
var eop opData
if result[0] != '(' {
- // Check for (Foo x) -> x, which can be converted to (Foo ...) -> (Copy ...).
+ // Check for (Foo x) => x, which can be converted to (Foo ...) => (Copy ...).
args2 = []string{result}
usingCopy = " using Copy"
} else {