2007-10-25 21:42:04 +04:00
# Unified Makefile for i386 and x86_64
2007-10-25 22:31:19 +04:00
# select defconfig based on actual architecture
2007-11-12 22:14:19 +03:00
# For the generic ARCH=x86, choose the defconfig from the build host's
# machine type as reported by `uname -m`; any other ARCH value maps
# directly to $(ARCH)_defconfig.
ifeq ($(ARCH),x86)
  ifeq ($(shell uname -m),x86_64)
        KBUILD_DEFCONFIG := x86_64_defconfig
  else
        KBUILD_DEFCONFIG := i386_defconfig
  endif
else
        KBUILD_DEFCONFIG := $(ARCH)_defconfig
endif
2007-10-25 22:31:19 +04:00
2014-01-08 15:21:20 +04:00
# How to compile the 16-bit code. Note we always compile for -march=i386;
# that way we can complain to the user if the CPU is insufficient.
2014-01-29 16:16:47 +04:00
#
# The -m16 option is supported by GCC >= 4.9 and clang >= 3.5. For
2014-06-05 00:16:48 +04:00
# older versions of GCC, include an *assembly* header to make sure that
# gcc doesn't play any games behind our back.
# Fallback for compilers without -m16: build as 32-bit and pass the
# code16gcc.h header to the assembler.
CODE16GCC_CFLAGS := -m32 -Wa,$(srctree)/arch/x86/boot/code16gcc.h
# Use -m16 when the compiler accepts it, otherwise the fallback above.
M16_CFLAGS       := $(call cc-option, -m16, $(CODE16GCC_CFLAGS))

# Flags for building real-mode (16-bit) code; exported for use by sub-makes.
REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \
                   -DDISABLE_BRANCH_PROFILING \
                   -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \
                   -fno-strict-aliasing -fomit-frame-pointer -fno-pic \
                   -mno-mmx -mno-sse \
                   $(call cc-option, -ffreestanding) \
                   $(call cc-option, -fno-stack-protector) \
                   $(call cc-option, -mpreferred-stack-boundary=2)
export REALMODE_CFLAGS
2008-01-30 15:32:20 +03:00
# BITS is used as a file-name suffix for sources that exist in both a
# 32-bit and a 64-bit version, which keeps shared Makefiles simple.
# e.g.: obj-y += foo_$(BITS).o
# It is exported here so that sub-makes see the value.
export BITS
2007-10-25 21:42:04 +04:00
2013-07-08 20:15:17 +04:00
# When the configuration needs relocations, have the vmlinux link keep
# its relocation records.
ifdef CONFIG_X86_NEED_RELOCS
        LDFLAGS_vmlinux := --emit-relocs
endif
2015-07-21 19:27:18 +03:00
#
# Prevent GCC from generating any FP code by mistake.
#
# This must happen before we try the -mpreferred-stack-boundary, see:
#
#    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
# -mno-avx is only added when the compiler accepts it.
KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
2007-11-12 22:14:19 +03:00
ifeq ($(CONFIG_X86_32),y)
        # 32-bit build: word size, machine name and checker define.
        BITS := 32
        UTS_MACHINE := i386
        CHECKFLAGS += -D__i386__

        # Pass -m32 to both the assembler and compiler flags if supported.
        biarch := $(call cc-option,-m32)
        KBUILD_AFLAGS += $(biarch)
        KBUILD_CFLAGS += $(biarch)

        KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return

        # Never want PIC in a 32-bit kernel, prevent breakage with GCC built
        # with nonstandard options
        KBUILD_CFLAGS += -fno-pic

        # prevent gcc from keeping the stack 16 byte aligned
        KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)

        # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use
        # a lot more stack due to the lack of sharing of stack slots:
        KBUILD_CFLAGS += $(call cc-ifversion, -lt, 0400, \
                                $(call cc-option,-fno-unit-at-a-time))

        # CPU-specific tuning. Anything which can be shared with UML should go here.
        include arch/x86/Makefile_32.cpu
        KBUILD_CFLAGS += $(cflags-y)

        # temporary until string.h is fixed
        KBUILD_CFLAGS += -ffreestanding
2007-10-25 21:42:04 +04:00
else
        # 64-bit build: word size, machine name and checker flags.
        BITS := 64
        UTS_MACHINE := x86_64
        CHECKFLAGS += -D__x86_64__ -m64

        # -m64 is passed unconditionally to assembler and compiler.
        biarch := -m64
        KBUILD_AFLAGS += -m64
        KBUILD_CFLAGS += -m64
x86: Align jump targets to 1-byte boundaries
The following NOP in a hot function caught my attention:
> 5a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
That's a dead NOP that bloats the function a bit, added for the
default 16-byte alignment that GCC applies for jump targets.
I realize that x86 CPU manufacturers recommend 16-byte jump
target alignments (it's in the Intel optimization manual),
to help their relatively narrow decoder prefetch alignment
and uop cache constraints, but the cost of that is very
significant:
text data bss dec filename
12566391 1617840 1089536 15273767 vmlinux.align.16-byte
12224951 1617840 1089536 14932327 vmlinux.align.1-byte
By using 1-byte jump target alignment (i.e. no alignment at all)
we get an almost 3% reduction in kernel size (!) - and a
probably similar reduction in I$ footprint.
Now, the usual justification for jump target alignment is the
following:
- modern decoders tend to have 16-byte (effective) decoder
prefetch windows. (AMD documents it higher but measurements
suggest the effective prefetch window on current uarchs is
still around 16 bytes)
- on Intel there's also the uop-cache with cachelines that have
16-byte granularity and limited associativity.
- older x86 uarchs had a penalty for decoder fetches that crossed
16-byte boundaries. These limits are mostly gone from recent
uarchs.
So if a forward jump target is aligned to cacheline boundary then
prefetches will start from a new prefetch-cacheline and there's
higher chance for decoding in fewer steps and packing tightly.
But I think that argument is flawed for typical optimized kernel
code flows: forward jumps often go to 'cold' (uncommon) pieces
of code, and aligning cold code to cache lines does not bring a
lot of advantages (they are uncommon), while it causes
collateral damage:
- their alignment 'spreads out' the cache footprint, it shifts
followup hot code further out
- plus it slows down even 'cold' code that immediately follows 'hot'
code (like in the above case), which could have benefited from the
partial cacheline that comes off the end of hot code.
But even in the cache-hot case the 16 byte alignment brings
disadvantages:
- it spreads out the cache footprint, possibly making the code
fall out of the L1 I$.
- On Intel CPUs, recent microarchitectures have plenty of
uop cache (typically doubling every 3 years) - while the
size of the L1 cache grows much less aggressively. So
workloads are rarely uop cache limited.
The only situation where alignment might matter are tight
loops that could fit into a single 16 byte chunk - but those
are pretty rare in the kernel: if they exist they tend
to be pointer chasing or generic memory ops, which both tend
to be cache miss (or cache allocation) intensive and are not
decoder bandwidth limited.
So the balance of arguments strongly favors packing kernel
instructions tightly versus maximizing for decoder bandwidth:
this patch changes the jump target alignment from 16 bytes
to 1 byte (tightly packed, unaligned).
Acked-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jason Low <jason.low2@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Link: http://lkml.kernel.org/r/20150410120846.GA17101@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-04-10 15:08:46 +03:00
# 64-bit code-generation flags; this closes the CONFIG_X86_32 conditional.
        # Align jump targets to 1 byte, not the default 16 bytes:
        KBUILD_CFLAGS += -falign-jumps=1

        # Pack loops tightly as well:
        KBUILD_CFLAGS += -falign-loops=1

        # Don't autogenerate traditional x87 instructions
        KBUILD_CFLAGS += $(call cc-option,-mno-80387)
        KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)

        # Use -mpreferred-stack-boundary=3 if supported.
        KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)

        # Use -mskip-rax-setup if supported.
        KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)

        # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
        cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
        cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)

        cflags-$(CONFIG_MCORE2) += \
                $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
        cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
                $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
        cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
        KBUILD_CFLAGS += $(cflags-y)

        KBUILD_CFLAGS += -mno-red-zone
        KBUILD_CFLAGS += -mcmodel=kernel

        # -funit-at-a-time shrinks the kernel .text considerably
        # unfortunately it makes reading oopses harder.
        KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time)

        # this works around some issues with generating unwind tables in older gccs
        # newer gccs do it by default
        KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args)
endif
2008-01-30 15:32:20 +03:00
2012-02-28 02:09:10 +04:00
# x32 support needs a toolchain that can assemble a test object, convert
# it to elf32-x86-64 and link it with -m elf32_x86_64. The probe below
# yields y when all three steps succeed and n otherwise.
ifdef CONFIG_X86_X32
        x32_ld_ok := $(call try-run,\
                        /bin/echo -e '1: .quad 1b' | \
                        $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" - && \
                        $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
                        $(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
        ifeq ($(x32_ld_ok),y)
                CONFIG_X86_X32_ABI := y
                KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
                KBUILD_CFLAGS += -DCONFIG_X86_X32_ABI
        else
                $(warning CONFIG_X86_X32 enabled but no binutils support)
        endif
endif
export CONFIG_X86_X32_ABI
2008-04-04 02:51:41 +04:00
# Don't unroll struct assignments with kmemcheck enabled:
# -fno-builtin-memcpy is added when the compiler accepts it.
ifeq ($(CONFIG_KMEMCHECK),y)
        KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
endif
2008-01-30 15:32:20 +03:00
# The stack pointer register is named differently on 32-bit and 64-bit x86.
# The variable name is built from the config symbol's value, so sp-y ends
# up holding the register name for whichever of the two is set to y.
sp-$(CONFIG_X86_32) := esp
sp-$(CONFIG_X86_64) := rsp
2015-10-06 03:47:57 +03:00
# Each probe below expands to the given -D define when the assembler
# accepts the test input, and to nothing otherwise.
# does binutils support CFI?
cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
# is .cfi_signal_frame supported too?
cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
2010-10-14 03:00:29 +04:00
# does binutils support specific instructions?
# Each probe contributes its -DCONFIG_AS_* define only when the assembler
# accepts the sample instruction.
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr := $(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
avx512_instr := $(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr := $(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr := $(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)

# The same set of probe results is passed to both assembler and compiler.
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
2008-01-30 15:32:20 +03:00
# Linker emulation follows the machine name (elf_i386 or elf_x86_64).
LDFLAGS := -m elf_$(UTS_MACHINE)

# Speed up the build
KBUILD_CFLAGS += -pipe
# Workaround for a gcc prerelease that unfortunately was shipped in a suse release
KBUILD_CFLAGS += -Wno-sign-compare
#
KBUILD_CFLAGS += -fno-asynchronous-unwind-tables

# mflags-y is not set in this part of the file; it is appended to both
# the compiler and assembler flags.
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
2012-10-15 23:16:56 +04:00
# Build the `relocs` target in arch/x86/tools once scripts_basic is done.
archscripts: scripts_basic
	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
2011-11-12 04:07:41 +04:00
###
# Syscall table generation
# Build the `all` target in arch/x86/entry/syscalls.
archheaders:
	$(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
2011-11-12 04:07:41 +04:00
2014-08-09 01:26:02 +04:00
# With CONFIG_KEXEC_FILE=y, build kexec-purgatory.c in arch/x86/purgatory;
# otherwise this rule has no recipe.
archprepare:
ifeq ($(CONFIG_KEXEC_FILE),y)
	$(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c
endif
2008-01-30 15:32:20 +03:00
###
# Kernel objects
2008-02-23 11:58:20 +03:00
# Objects listed in head-y; the first two names depend on $(BITS),
# e.g. head_32.o / head32.o or head_64.o / head64.o.
head-y := arch/x86/kernel/head_$(BITS).o
head-y += arch/x86/kernel/head$(BITS).o
head-y += arch/x86/kernel/ebda.o
head-y += arch/x86/kernel/platform-quirks.o
2008-01-30 15:32:20 +03:00
libs-y  += arch/x86/lib/

# See arch/x86/Kbuild for content of core part of the kernel
core-y += arch/x86/

# drivers-y are linked after core-y
drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
drivers-$(CONFIG_PCI)            += arch/x86/pci/

# must be linked after kernel/
drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/

# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/

drivers-$(CONFIG_FB) += arch/x86/video/

drivers-$(CONFIG_RAS) += arch/x86/ras/
2008-01-30 15:32:20 +03:00
####
# boot loader support. Several targets are kept for legacy purposes
# Directory whose Makefile builds the boot images.
boot := arch/x86/boot

# Image targets that are forwarded to $(boot) by the rule further down.
BOOT_TARGETS = bzlilo bzdisk fdimage fdimage144 fdimage288 isoimage

PHONY += bzImage $(BOOT_TARGETS)

# Default kernel to build
all: bzImage

# KBUILD_IMAGE specifies the target image being built
KBUILD_IMAGE := $(boot)/bzImage
2008-01-30 15:32:20 +03:00
2009-03-12 22:50:33 +03:00
# Build the compressed kernel image from vmlinux. With
# CONFIG_X86_DECODER_SELFTEST=y the `posttest` target in arch/x86/tools
# runs first. Afterwards a symlink to the image is placed under
# $(objtree)/arch/$(UTS_MACHINE)/boot/.
bzImage: vmlinux
ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
	$(Q)$(MAKE) $(build)=arch/x86/tools posttest
endif
	$(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
	$(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
	$(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
2008-01-30 15:32:20 +03:00
2009-03-12 22:50:33 +03:00
# Each boot target is delegated, under its own name, to the Makefile in
# $(boot) after vmlinux has been built.
$(BOOT_TARGETS): vmlinux
	$(Q)$(MAKE) $(build)=$(boot) $@
2008-01-30 15:32:20 +03:00
2009-04-17 21:46:37 +04:00
# `install` is delegated to the Makefile in $(boot).
PHONY += install
install:
	$(Q)$(MAKE) $(build)=$(boot) $@
2008-01-30 15:32:20 +03:00
# `vdso_install` is delegated to the Makefile in arch/x86/entry/vdso.
PHONY += vdso_install
vdso_install:
	$(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@
2008-01-30 15:32:20 +03:00
# Remove the arch/i386 and arch/x86_64 directories under $(objtree), then
# run the clean pass in the boot, tools and purgatory directories.
archclean:
	$(Q)rm -rf $(objtree)/arch/i386
	$(Q)rm -rf $(objtree)/arch/x86_64
	$(Q)$(MAKE) $(clean)=$(boot)
	$(Q)$(MAKE) $(clean)=arch/x86/tools
	$(Q)$(MAKE) $(clean)=arch/x86/purgatory
2008-01-30 15:32:20 +03:00
# Help text for the x86-specific targets, stored as a sequence of shell
# echo commands. $(INSTALLKERNEL) is expanded by make; $$(INSTALL_PATH)
# is printed literally as $(INSTALL_PATH).
# NOTE(review): the column alignment inside the quoted strings appears to
# have been collapsed in this copy; the text is kept exactly as found.
define archhelp
  echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)'
  echo ' install - Install kernel using'
  echo ' (your) ~/bin/$(INSTALLKERNEL) or'
  echo ' (distribution) /sbin/$(INSTALLKERNEL) or'
  echo ' install to $$(INSTALL_PATH) and run lilo'
  echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
  echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)'
  echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)'
  echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)'
  echo ' bzdisk/fdimage*/isoimage also accept:'
  echo ' FDARGS="..." arguments for the booted kernel'
  echo ' FDINITRD=file initrd for the booted kernel'
endef