incorrect vector codegen #42148

llvmbot · 2019-07-28T23:37:00Z


Bugzilla Link	42803
Resolution	FIXED
Resolved on	Feb 24, 2020 15:17
Version	9.0
OS	Linux
Blocks	#30613
Attachments	llvm-ir
Reporter	LLVM Bugzilla Contributor
CC	@andrewrk,@topperc,@efriedma-quic,@RKSimon
Fixed by commit(s)	`a5fa778`

Extended Description

Using LLVM 8 (Ubuntu Disco, 8.0.0-3 (tags/RELEASE_800/final)), the following Zig program:

pub export fn start() void {
var a: [3]bool = []bool{ false, true, false};
var x2: @Vector(3, bool) = a;
if (x2[1] != true) unreachable;
}
const builtin = @import("builtin");
pub fn panic(msg: []const u8, error_return_trace: ?*builtin.StackTrace) noreturn {
while (true) {}
}

compiles to this LLVM IR (full file included):

@0 = internal unnamed_addr constant [3 x i1] [i1 false, i1 true, i1 false], align 1

; Function Attrs: nobuiltin nounwind
define void @_start() #2 !dbg !136 {
Entry:
%a = alloca [3 x i1], align 1
%x2 = alloca <3 x i1>, align 4
%0 = bitcast [3 x i1]* %a to i8*, !dbg !147
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 bitcast ([3 x i1]* @0 to i8*), i64 3, i1 false), !dbg !147
call void @llvm.dbg.declare(metadata [3 x i1]* %a, metadata !140, metadata !DIExpression()), !dbg !147
%1 = load [3 x i1], [3 x i1]* %a, !dbg !148
%vector_to_array = extractvalue [3 x i1] %1, 0, !dbg !148
%2 = insertelement <3 x i1> undef, i1 %vector_to_array, i32 0, !dbg !148
%vector_to_array1 = extractvalue [3 x i1] %1, 1, !dbg !148
%3 = insertelement <3 x i1> %2, i1 %vector_to_array1, i32 1, !dbg !148
%vector_to_array2 = extractvalue [3 x i1] %1, 2, !dbg !148
%4 = insertelement <3 x i1> %3, i1 %vector_to_array2, i32 2, !dbg !148
store <3 x i1> %4, <3 x i1>* %x2, align 4, !dbg !149
call void @llvm.dbg.declare(metadata <3 x i1>* %x2, metadata !145, metadata !DIExpression()), !dbg !149
%5 = load <3 x i1>, <3 x i1>* %x2, align 4, !dbg !150
%6 = extractelement <3 x i1> %5, i32 1, !dbg !150
%7 = icmp ne i1 %6, true, !dbg !151
br i1 %7, label %Then, label %Else, !dbg !152

Then: ; preds = %Entry
tail call fastcc void @panic(%"[]u8"* @2, %builtin.StackTrace* null), !dbg !153
unreachable, !dbg !153

Else: ; preds = %Entry
br label %EndIf, !dbg !152

EndIf: ; preds = %Else
ret void, !dbg !154
}

which produces the following x86_64 code (disassembled):

Dump of assembler code for function _start:
0x0000000000201010 <+0>: push %rbp
0x0000000000201011 <+1>: mov %rsp,%rbp
=> 0x0000000000201014 <+4>: sub $0x10,%rsp
0x0000000000201018 <+8>: mov -0xef8(%rip),%al # 0x200126 <__unnamed_1+2>
0x000000000020101e <+14>: mov %al,-0x2(%rbp)
0x0000000000201021 <+17>: mov -0xf04(%rip),%cx # 0x200124 <__unnamed_1>
0x0000000000201028 <+24>: mov %cx,-0x4(%rbp)
0x000000000020102c <+28>: mov -0x2(%rbp),%al
0x000000000020102f <+31>: mov -0x4(%rbp),%dl
0x0000000000201032 <+34>: mov -0x3(%rbp),%sil
0x0000000000201036 <+38>: add %sil,%sil
0x0000000000201039 <+41>: or %sil,%dl
0x000000000020103c <+44>: shl $0x2,%al
0x000000000020103f <+47>: or %al,%dl
0x0000000000201041 <+49>: and $0x7,%dl
0x0000000000201044 <+52>: mov %dl,-0xc(%rbp)
0x0000000000201047 <+55>: mov -0xc(%rbp),%al
0x000000000020104a <+58>: mov %al,-0x8(%rbp)
0x000000000020104d <+61>: mov -0x8(%rbp),%al
0x0000000000201050 <+64>: and $0x1,%al
0x0000000000201052 <+66>: neg %al
0x0000000000201054 <+68>: test $0x1,%al
0x0000000000201056 <+70>: jne 0x20106d <_start+93>
0x0000000000201058 <+72>: jmp 0x20105a <_start+74>
0x000000000020105a <+74>: xor %eax,%eax
0x000000000020105c <+76>: mov %eax,%esi
0x000000000020105e <+78>: movabs $0x200140,%rdi
0x0000000000201068 <+88>: callq 0x201000
0x000000000020106d <+93>: jmp 0x20106f <_start+95>
0x000000000020106f <+95>: add $0x10,%rsp
0x0000000000201073 <+99>: pop %rbp
0x0000000000201074 <+100>: retq

The text was updated successfully, but these errors were encountered:

topperc · 2019-07-29T00:05:13Z

By "bad" do you mean incorrect or not optimal?

llvmbot · 2019-07-29T00:39:12Z

Incorrect. The middle element is read back as false (it should be true), and the program panics.

llvmbot · 2019-07-29T00:39:58Z

runnable executable

llvmbot · 2019-07-29T00:46:20Z

Here is a version that can be built with current zig (rather than my branch) and reproduces the problem, but it's more difficult to follow, so I don't recommend looking at it. https://godbolt.org/z/o6lm3Z

llvmbot · 2019-07-29T01:28:04Z

I reproduced it, and reduced the test case on ARM:

pub export fn start() void {
var a: [3]bool = []bool{ false, true, false};
var x2: @Vector(3, bool) = a;
if (x2[1] != true) unreachable;
}
const builtin = @import("builtin");
pub fn panic(msg: []const u8, error_return_trace: ?*builtin.StackTrace) noreturn {
while (true) {}
}

=======================

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64v8-unknown-linux-gnu"

%"[]u8" = type { i8*, i64 }
%"[]usize" = type { i64*, i64 }

@0 = internal unnamed_addr constant [3 x i1] [i1 false, i1 true, i1 false], align 1

; Function Attrs: nobuiltin noreturn nounwind
define internal fastcc void @panic() unnamed_addr #0 {
Entry:
br label %WhileCond

WhileCond: ; preds = %WhileCond, %Entry
br label %WhileCond
}

; Function Attrs: nobuiltin nounwind
define void @_start() #2 {
Entry:
%a = alloca [3 x i1], align 1
%0 = bitcast [3 x i1]* %a to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %0, i8* align 1 bitcast ([3 x i1]* @0 to i8*), i64 3, i1 false)
%1 = load [3 x i1], [3 x i1]* %a
%vector_to_array = extractvalue [3 x i1] %1, 0
%2 = insertelement <3 x i1> undef, i1 %vector_to_array, i32 0
%vector_to_array1 = extractvalue [3 x i1] %1, 1
%3 = insertelement <3 x i1> %2, i1 %vector_to_array1, i32 1
%vector_to_array2 = extractvalue [3 x i1] %1, 2
%4 = insertelement <3 x i1> %3, i1 %vector_to_array2, i32 2
%v6 = extractelement <3 x i1> %4, i32 1
%v7 = icmp ne i1 %v6, true
br i1 %v7, label %Then, label %Else

Then: ; preds = %Entry
tail call fastcc void @panic()
unreachable

Else: ; preds = %Entry
br label %EndIf

EndIf: ; preds = %Else
ret void
}

; Function Attrs: argmemonly nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #3

attributes #0 = { nobuiltin noreturn nounwind "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nobuiltin nounwind "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
attributes #3 = { argmemonly nounwind }

=========

    .text
    .file   "min.ll"
    .p2align        2               // -- Begin function panic
    .type   panic,@function

panic: // @panic
// %bb.0: // %Entry
.LBB0_1: // %WhileCond
// =>This Inner Loop Header: Depth=1
b .LBB0_1
.Lfunc_end0:
.size panic, .Lfunc_end0-panic
// -- End function
.globl _start // -- Begin function _start
.p2align 2
.type _start,@function
_start: // @_start
// %bb.0: // %Entry
sub sp, sp, #32 // =32
adrp x8, __unnamed_1
add x8, x8, :lo12:__unnamed_1
ldrh w9, [x8]
ldrb w8, [x8, #2]
stp x29, x30, [sp, #16] // 16-byte Folded Spill
add x29, sp, #16 // =16
sturh w9, [x29, #-4]
ubfx w9, w9, #8, #8
cmp w9, #1 // =1
sturb w8, [x29, #-2]
b.ne .LBB1_2
// %bb.1: // %EndIf
ldp x29, x30, [sp, #16] // 16-byte Folded Reload
add sp, sp, #32 // =32
ret
.LBB1_2: // %Then
bl panic
.Lfunc_end1:
.size _start, .Lfunc_end1-_start
// -- End function
.type __unnamed_1,@object // @0
.section .rodata,"a",@progbits
__unnamed_1:
.byte 0 // 0x0
.byte 1 // 0x1
.byte 0 // 0x0
.size __unnamed_1, 3

    .section        ".note.GNU-stack","",@progbits

topperc · 2019-07-29T03:11:17Z

Do you know if this reproduces with trunk?

llvmbot · 2019-07-29T03:24:18Z

I reproduced it with 9.0.0-svn362869-1~exp1 in Debian experimental on arm64. The generated assembly appeared to be identical.

topperc · 2019-07-29T17:44:44Z

Are you compiling with sse disabled? That's the only way I could get the assembly you were showing.

The issue seems to be a bug in the handling of vector loads that don't fit exactly into some number of bytes. We get confused and end up treating all 3 bits as being in bit 0 of the same byte.

llvmbot · 2019-07-29T18:50:42Z

Are you compiling with sse disabled?

Zig uses -march=native and this laptop supports AVX-256. I also reproduced it on aarch64.

llvmbot · 2019-07-29T18:51:45Z

Ignore that. These are all with optimizations disabled, -O0. I could not reproduce with optimizations enabled.

efriedma-quic · 2019-07-29T18:59:33Z

Target-independent legalization for load and store operations on vector types with non-byte-sized element types has been a known problem in the past... it wasn't clear how we wanted to represent them. We've settled on tightly packing them (so <3 x i1> is one byte), and we fixed most of the legalization code to correctly honor that. But maybe there's something that still isn't fixed.

I can't reproduce the issue with the AArch64 testcase; can you give complete steps for building an executable that reproduces the issue?

topperc · 2019-07-29T19:08:18Z

Eli, this is on X86, at least with sse disabled. What I see happening is that the v3i1 store needs to be widened to v4i1 during type legalization. The call to FindMemType in the load widening code ends up returning i1 as the type because no other types evenly divide into WidenWidth, which is 4. This causes the code to emit 3 identical i1 loads since the pointer doesn't increment. These of course get CSEd to the same node. This is then assembled together to make a v3i1 vector. Since the loads are just i1, the element offset within the byte is now lost.

topperc · 2019-07-29T19:09:08Z

Oops that should have said "v3i1 load needs to be widened". v3i1 stores already scalarize, but not loads.

efriedma-quic · 2019-07-29T19:15:46Z

DAGTypeLegalizer::WidenVecOp_STORE has a special case for non-byte-sized stores. (See https://reviews.llvm.org/D42100 .) Looks like we never applied an equivalent fix to DAGTypeLegalizer::WidenVecRes_LOAD.

andrewrk · 2020-02-13T22:11:09Z

This is still an open downstream bug with LLVM 10 rc1. ziglang/zig#3246 (comment)

topperc · 2020-02-14T06:27:41Z

Candidate patch https://reviews.llvm.org/D74590

Though I tend to think that zig should not be generating vectors of i1 in memory.

llvmbot · 2020-02-14T06:31:11Z

Though I tend to think that zig should not be generating vectors of i1 in memory.

All the vector comparison operators return vectors of i1. On x86 this ends up as mask registers, and on PPC or ARM this ends up as an i1 sign-extended to the vector width of the operators.

efriedma-quic · 2020-02-14T21:25:52Z

"should not", meaning that you're going to get terrible quality code, and likely always will. SIMD instruction sets tend to have wildly different ways of representing compare results depending on the target and the type of the compare operands.

andrewrk · 2020-02-14T21:28:50Z

LLVM's icmp instruction, when used on vectors, returns <N x i1>. Why design the instructions this way if the return type is not recommended?

http://llvm.org/docs/LangRef.html#icmp-instruction

efriedma-quic · 2020-02-14T23:31:41Z

Making icmp return <N x i1> is semantically meaningful. And it gives the backend the flexibility to easily do the right thing across all the targets LLVM supports if the result is used in a select, or sign-extended, or something like that.

The point at which code generation becomes tricky is when you try to store the vector to memory.

andrewrk · 2020-02-15T01:27:53Z

Ah, I see, thanks. Zig is storing vectors of i1 in memory only in the same way that clang stores function parameters in memory - in order to emit debug info, and so that programmers can name intermediate values. It's fully expected that mem2reg would remove these. I'll double check that this is happening. Thanks

topperc · 2020-02-24T23:17:58Z

Maybe fixed after a5fa778. I only checked the output; I didn't run anything.

RKSimon mentioned this issue Dec 5, 2016

[Meta] Load/Store/Bitcast Handling of <X x i1> bool vectors #30613

Open

llvmbot transferred this issue from llvm/llvm-bugzilla-archive Dec 10, 2021

This issue was closed.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

incorrect vector codegen #42148

incorrect vector codegen #42148

llvmbot commented Jul 28, 2019

topperc commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

topperc commented Jul 29, 2019

llvmbot commented Jul 29, 2019

topperc commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

efriedma-quic commented Jul 29, 2019

topperc commented Jul 29, 2019

topperc commented Jul 29, 2019

efriedma-quic commented Jul 29, 2019

andrewrk commented Feb 13, 2020

topperc commented Feb 14, 2020

llvmbot commented Feb 14, 2020

efriedma-quic commented Feb 14, 2020

andrewrk commented Feb 14, 2020

efriedma-quic commented Feb 14, 2020

andrewrk commented Feb 15, 2020

topperc commented Feb 24, 2020

incorrect vector codegen #42148

incorrect vector codegen #42148

Comments

llvmbot commented Jul 28, 2019

Extended Description

topperc commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

topperc commented Jul 29, 2019

llvmbot commented Jul 29, 2019

topperc commented Jul 29, 2019

llvmbot commented Jul 29, 2019

llvmbot commented Jul 29, 2019

efriedma-quic commented Jul 29, 2019

topperc commented Jul 29, 2019

topperc commented Jul 29, 2019

efriedma-quic commented Jul 29, 2019

andrewrk commented Feb 13, 2020

topperc commented Feb 14, 2020

llvmbot commented Feb 14, 2020

efriedma-quic commented Feb 14, 2020

andrewrk commented Feb 14, 2020

efriedma-quic commented Feb 14, 2020

andrewrk commented Feb 15, 2020

topperc commented Feb 24, 2020