Our PostgreSQL server keeps going into recovery mode after a backend process segfaults. This is PostgreSQL 8.3.3, compiled from source, on Ubuntu 6.04 with Slony.
Some ideas from IRC:
RhodiumToad: 1) probably least likely, something corrupt in the math libs; the fact that it's not reproducible makes this improbable
RhodiumToad: 2) more likely: a register or memory stomp in a signal handler, which could be the result of an OS bug or a pg miscompile
RhodiumToad: 3) slightly less likely: a memory stomp somewhere else in pg that happens to be clobbering something in the math library
Here is some gdb output from a core dump:
(gdb) bt
#0 0x00002aaaaae09a32 in fegetexcept () from /lib/libm.so.6
#1 0x00002aaaaae1e7e4 in log () from /lib/libm.so.6
#2 0x000000000055250d in cost_sort ()
#3 0x0000000000554757 in cost_mergejoin ()
#4 0x0000000000570673 in create_mergejoin_path ()
#5 0x000000000055a337 in add_paths_to_joinrel ()
#6 0x000000000055b650 in make_join_rel ()
#7 0x000000000055bb61 in join_search_one_level ()
#8 0x000000000055145d in standard_join_search ()
#9 0x0000000000563329 in query_planner ()
#10 0x0000000000563e98 in grouping_planner ()
#11 0x0000000000564d08 in subquery_planner ()
#12 0x000000000056511a in standard_planner ()
#13 0x00000000005a7271 in pg_plan_query ()
#14 0x00000000005a7877 in pg_plan_queries ()
#15 0x00000000005a7b2e in exec_simple_query ()
#16 0x00000000005a9495 in PostgresMain ()
#17 0x000000000057ec88 in ServerLoop ()
#18 0x000000000057f7cb in PostmasterMain ()
#19 0x00000000005370ee in main ()
(gdb) frame
#0 0x00002aaaaae09a32 in fegetexcept () from /lib/libm.so.6
(gdb) info reg
rax 0xa2dd80 10673536
rbx 0x3ffc000000000000 4610560118520545280
rcx 0x2b0 688
rdx 0x3fec000000000000 4606056518893174784
rsi 0x2aaaaae55968 46912499964264
rdi 0x6900 26880
rbp 0x2aaaaae55960 0x2aaaaae55960
rsp 0x7fffff877510 0x7fffff877510
r8 0x2aaaaae54e08 46912499961352
r9 0x2aaaaae54e00 46912499961344
r10 0x2aaaaae542a0 46912499958432
r11 0x6900 26880
r12 0x684c 26700
r13 0xffffffe0 4294967264
r14 0x2aaaaae53ce0 46912499956960
r15 0xa0a448 10527816
rip 0x2aaaaae09a32 0x2aaaaae09a32 <fegetexcept+48178>
eflags 0x10206 66054
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
(gdb) disass 0x2aaaaae09a00 0x2aaaaae09b00
Dump of assembler code from 0x2aaaaae09a00 to 0x2aaaaae09b00:
0x00002aaaaae09a00 <fegetexcept+48128>: and $0x8,%al
0x00002aaaaae09a02 <fegetexcept+48130>: mov 0x8(%rsp),%r11
0x00002aaaaae09a07 <fegetexcept+48135>: movlpd 317329(%rip),%xmm6 # 0x2aaaaae571a0 <__signbitl+131008>
0x00002aaaaae09a0f <fegetexcept+48143>: sar $0x20,%r11
0x00002aaaaae09a13 <fegetexcept+48147>: mulsd %xmm11,%xmm6
0x00002aaaaae09a18 <fegetexcept+48152>: and $0xfffff,%r11d
0x00002aaaaae09a1f <fegetexcept+48159>: addsd %xmm14,%xmm5
0x00002aaaaae09a24 <fegetexcept+48164>: sar $0x4,%r11d
0x00002aaaaae09a28 <fegetexcept+48168>: lea 0xffffffffffffff4c(%r11),%r12d
0x00002aaaaae09a2f <fegetexcept+48175>: movslq %r11d,%rdi
0x00002aaaaae09a32 <fegetexcept+48178>: mulsd (%r10,%rdi,8),%xmm8 ######### <------ where it segfaults
0x00002aaaaae09a38 <fegetexcept+48184>: shl $0x4,%rdi
0x00002aaaaae09a3c <fegetexcept+48188>: cvtsi2sd %r12d,%xmm9
0x00002aaaaae09a41 <fegetexcept+48193>: movlpd (%rdi,%rbp,1),%xmm3
0x00002aaaaae09a46 <fegetexcept+48198>: movlpd (%rdi,%rsi,1),%xmm13
0x00002aaaaae09a4c <fegetexcept+48204>: ucomisd %xmm3,%xmm12
0x00002aaaaae09a51 <fegetexcept+48209>: mulsd 317246(%rip),%xmm9 # 0x2aaaaae57198 <__signbitl+131000>
0x00002aaaaae09a5a <fegetexcept+48218>: addsd %xmm14,%xmm9
Across the crashes, the value of r10 is the same every time, but rdi changes.
Andrew
Some ideas from IRC:
RhodiumToad: 1) probably least likely, something corrupt in the math libs; the fact that it's not reproducible makes this improbable
RhodiumToad: 2) more likely: a register or memory stomp in a signal handler, which could be the result of an OS bug or a pg miscompile
RhodiumToad: 3) slightly less likely: a memory stomp somewhere else in pg that happens to be clobbering something in the math library
Here is some gdb output from a core dump:
(gdb) bt
#0 0x00002aaaaae09a32 in fegetexcept () from /lib/libm.so.6
#1 0x00002aaaaae1e7e4 in log () from /lib/libm.so.6
#2 0x000000000055250d in cost_sort ()
#3 0x0000000000554757 in cost_mergejoin ()
#4 0x0000000000570673 in create_mergejoin_path ()
#5 0x000000000055a337 in add_paths_to_joinrel ()
#6 0x000000000055b650 in make_join_rel ()
#7 0x000000000055bb61 in join_search_one_level ()
#8 0x000000000055145d in standard_join_search ()
#9 0x0000000000563329 in query_planner ()
#10 0x0000000000563e98 in grouping_planner ()
#11 0x0000000000564d08 in subquery_planner ()
#12 0x000000000056511a in standard_planner ()
#13 0x00000000005a7271 in pg_plan_query ()
#14 0x00000000005a7877 in pg_plan_queries ()
#15 0x00000000005a7b2e in exec_simple_query ()
#16 0x00000000005a9495 in PostgresMain ()
#17 0x000000000057ec88 in ServerLoop ()
#18 0x000000000057f7cb in PostmasterMain ()
#19 0x00000000005370ee in main ()
(gdb) frame
#0 0x00002aaaaae09a32 in fegetexcept () from /lib/libm.so.6
(gdb) info reg
rax 0xa2dd80 10673536
rbx 0x3ffc000000000000 4610560118520545280
rcx 0x2b0 688
rdx 0x3fec000000000000 4606056518893174784
rsi 0x2aaaaae55968 46912499964264
rdi 0x6900 26880
rbp 0x2aaaaae55960 0x2aaaaae55960
rsp 0x7fffff877510 0x7fffff877510
r8 0x2aaaaae54e08 46912499961352
r9 0x2aaaaae54e00 46912499961344
r10 0x2aaaaae542a0 46912499958432
r11 0x6900 26880
r12 0x684c 26700
r13 0xffffffe0 4294967264
r14 0x2aaaaae53ce0 46912499956960
r15 0xa0a448 10527816
rip 0x2aaaaae09a32 0x2aaaaae09a32 <fegetexcept+48178>
eflags 0x10206 66054
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
(gdb) disass 0x2aaaaae09a00 0x2aaaaae09b00
Dump of assembler code from 0x2aaaaae09a00 to 0x2aaaaae09b00:
0x00002aaaaae09a00 <fegetexcept+48128>: and $0x8,%al
0x00002aaaaae09a02 <fegetexcept+48130>: mov 0x8(%rsp),%r11
0x00002aaaaae09a07 <fegetexcept+48135>: movlpd 317329(%rip),%xmm6 # 0x2aaaaae571a0 <__signbitl+131008>
0x00002aaaaae09a0f <fegetexcept+48143>: sar $0x20,%r11
0x00002aaaaae09a13 <fegetexcept+48147>: mulsd %xmm11,%xmm6
0x00002aaaaae09a18 <fegetexcept+48152>: and $0xfffff,%r11d
0x00002aaaaae09a1f <fegetexcept+48159>: addsd %xmm14,%xmm5
0x00002aaaaae09a24 <fegetexcept+48164>: sar $0x4,%r11d
0x00002aaaaae09a28 <fegetexcept+48168>: lea 0xffffffffffffff4c(%r11),%r12d
0x00002aaaaae09a2f <fegetexcept+48175>: movslq %r11d,%rdi
0x00002aaaaae09a32 <fegetexcept+48178>: mulsd (%r10,%rdi,8),%xmm8 ######### <------ where it segfaults
0x00002aaaaae09a38 <fegetexcept+48184>: shl $0x4,%rdi
0x00002aaaaae09a3c <fegetexcept+48188>: cvtsi2sd %r12d,%xmm9
0x00002aaaaae09a41 <fegetexcept+48193>: movlpd (%rdi,%rbp,1),%xmm3
0x00002aaaaae09a46 <fegetexcept+48198>: movlpd (%rdi,%rsi,1),%xmm13
0x00002aaaaae09a4c <fegetexcept+48204>: ucomisd %xmm3,%xmm12
0x00002aaaaae09a51 <fegetexcept+48209>: mulsd 317246(%rip),%xmm9 # 0x2aaaaae57198 <__signbitl+131000>
0x00002aaaaae09a5a <fegetexcept+48218>: addsd %xmm14,%xmm9
Across the crashes, the value of r10 is the same every time, but rdi changes.
Andrew
No comments:
Post a Comment