Revision 2a18da7a9c7886f1c7307f8d3f23f24318583f03 authored by George Spelvin on 23 May 2016, 11:43:58 UTC, committed by George Spelvin on 28 May 2016, 19:45:29 UTC
Patch 0fed3ac866 improved the hash mixing, but the function is slower
than necessary; there's a 7-instruction dependency chain (10 on x86)
each loop iteration.

Word-at-a-time access is a very tight loop (which is good, because
link_path_walk() is one of the hottest code paths in the entire kernel),
and the hash mixing function must not have a longer latency to avoid
slowing it down.

There do not appear to be any published fast hash functions that:
1) Operate on the input a word at a time, and
2) Don't need to know the length of the input beforehand, and
3) Have a single iterated mixing function, not needing conditional
   branches or unrolling to distinguish different loop iterations.

One of the algorithms which comes closest is Yann Collet's xxHash, but
that's two dependent multiplies per word, which is too much.

The key insights in this design are:

1) Barring expensive ops like multiplies, to diffuse one input bit
   across 64 bits of hash state takes at least log2(64) = 6 sequentially
   dependent instructions.  That is more cycles than we'd like.
2) An operation like "hash ^= hash << 13" requires a second temporary
   register anyway, and on a 2-operand machine like x86, it's three
   instructions.
3) A better use of a second register is to hold a two-word hash state.
   With careful design, no temporaries are needed at all, so it doesn't
   increase register pressure.  And this gets rid of register copying
   on 2-operand machines, so the code is smaller and faster.
4) Using two words of state weakens the requirement for one-round mixing;
   we now have two rounds of mixing before cancellation is possible.
5) A two-word hash state also allows operations on both halves to be
   done in parallel, so on a superscalar processor we get more mixing
   in fewer cycles.

I ended up using a mixing function inspired by the ChaCha and Speck
round functions.  It is 6 simple instructions and 3 cycles per iteration
(assuming multiply by 9 can be done by an "lea" instruction):

		x ^= *input++;
	y ^= x;	x = ROL(x, K1);
	x += y;	y = ROL(y, K2);
	y *= 9;

Not only is this reversible, two consecutive rounds are reversible:
if you are given the initial and final states, but not the intermediate
state, it is possible to compute both input words.  This means that at
least 3 words of input are required to create a collision.

(It also has the property, used by hash_name() to avoid a branch, that
it hashes all-zero to all-zero.)

The rotate constants K1 and K2 were found by experiment.  The search took
a sample of random initial states (I used 1023) and considered the effect
of flipping each of the 64 input bits on each of the 128 output bits two
rounds later.  Each of the 8192 pairs can be considered a biased coin, and
adding up the Shannon entropy of all of them produces a score.

The best-scoring shifts also did well in other tests (flipping bits in y,
trying 3 or 4 rounds of mixing, flipping all 64*63/2 pairs of input bits),
so the choice was made with the additional constraint that the sum of the
shifts is odd and not too close to the word size.

The final state is then folded into a 32-bit hash value by a less carefully
optimized multiply-based scheme.  This also has to be fast, as pathname
components tend to be short (the most common case is one iteration!), but
there's some room for latency, as there is a fair bit of intervening logic
before the hash value is used for anything.

(Performance verified with "bonnie++ -s 0 -n 1536:-2" on tmpfs.  I need
a better benchmark; the numbers seem to show a slight dip in performance
between 4.6.0 and this patch, but they're too noisy to quote.)

Special thanks to Bruce fields for diligent testing which uncovered a
nasty fencepost error in an earlier version of this patch.

[checkpatch.pl formatting complaints noted and respectfully disagreed with.]

Signed-off-by: George Spelvin <linux@sciencehorizons.net>
Tested-by: J. Bruce Fields <bfields@redhat.com>
1 parent ef703f4
Raw File
Kconfig
#
# File system configuration
#

menu "File systems"

# Use unaligned word dcache accesses
config DCACHE_WORD_ACCESS
       bool

if BLOCK

source "fs/ext2/Kconfig"
source "fs/ext4/Kconfig"
source "fs/jbd2/Kconfig"

config FS_MBCACHE
# Meta block cache for Extended Attributes (ext2/ext3/ext4)
	tristate
	default y if EXT2_FS=y && EXT2_FS_XATTR
	default y if EXT4_FS=y
	default m if EXT2_FS_XATTR || EXT4_FS

source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"

source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"

config FS_DAX
	bool "Direct Access (DAX) support"
	depends on MMU
	depends on !(ARM || MIPS || SPARC)
	help
	  Direct Access (DAX) can be used on memory-backed block devices.
	  If the block device supports DAX and the filesystem supports DAX,
	  then you can avoid using the pagecache to buffer I/Os.  Turning
	  on this option will compile in support for DAX; you will need to
	  mount the filesystem using the -o dax option.

	  If you do not have a block device that is capable of using this,
	  or if unsure, say N.  Saying Y will increase the size of the kernel
	  by about 5kB.

config FS_DAX_PMD
	bool
	default FS_DAX
	depends on FS_DAX
	depends on ZONE_DEVICE
	depends on TRANSPARENT_HUGEPAGE

endif # BLOCK

# Posix ACL utility routines
#
# Note: Posix ACLs can be implemented without these helpers.  Never use
# this symbol for ifdefs in core code.
#
config FS_POSIX_ACL
	def_bool n

config EXPORTFS
	tristate

config FILE_LOCKING
	bool "Enable POSIX file locking API" if EXPERT
	default y
	help
	  This option enables standard file locking support, required
          for filesystems like NFS and for the flock() system
          call. Disabling this option saves about 11k.

config MANDATORY_FILE_LOCKING
	bool "Enable Mandatory file locking"
	depends on FILE_LOCKING
	default y
	help
	  This option enables files appropriately marked files on appropriely
	  mounted filesystems to support mandatory locking.

	  To the best of my knowledge this is dead code that no one cares about.

source "fs/crypto/Kconfig"

source "fs/notify/Kconfig"

source "fs/quota/Kconfig"

source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
source "fs/overlayfs/Kconfig"

menu "Caches"

source "fs/fscache/Kconfig"
source "fs/cachefiles/Kconfig"

endmenu

if BLOCK
menu "CD-ROM/DVD Filesystems"

source "fs/isofs/Kconfig"
source "fs/udf/Kconfig"

endmenu
endif # BLOCK

if BLOCK
menu "DOS/FAT/NT Filesystems"

source "fs/fat/Kconfig"
source "fs/ntfs/Kconfig"

endmenu
endif # BLOCK

menu "Pseudo filesystems"

source "fs/proc/Kconfig"
source "fs/kernfs/Kconfig"
source "fs/sysfs/Kconfig"

config TMPFS
	bool "Tmpfs virtual memory file system support (former shm fs)"
	depends on SHMEM
	help
	  Tmpfs is a file system which keeps all files in virtual memory.

	  Everything in tmpfs is temporary in the sense that no files will be
	  created on your hard drive. The files live in memory and swap
	  space. If you unmount a tmpfs instance, everything stored therein is
	  lost.

	  See <file:Documentation/filesystems/tmpfs.txt> for details.

config TMPFS_POSIX_ACL
	bool "Tmpfs POSIX Access Control Lists"
	depends on TMPFS
	select TMPFS_XATTR
	select FS_POSIX_ACL
	help
	  POSIX Access Control Lists (ACLs) support additional access rights
	  for users and groups beyond the standard owner/group/world scheme,
	  and this option selects support for ACLs specifically for tmpfs
	  filesystems.

	  If you've selected TMPFS, it's possible that you'll also need
	  this option as there are a number of Linux distros that require
	  POSIX ACL support under /dev for certain features to work properly.
	  For example, some distros need this feature for ALSA-related /dev
	  files for sound to work properly.  In short, if you're not sure,
	  say Y.

	  To learn more about Access Control Lists, visit the POSIX ACLs for
	  Linux website <http://acl.bestbits.at/>.

config TMPFS_XATTR
	bool "Tmpfs extended attributes"
	depends on TMPFS
	default n
	help
	  Extended attributes are name:value pairs associated with inodes by
	  the kernel or by users (see the attr(5) manual page, or visit
	  <http://acl.bestbits.at/> for details).

	  Currently this enables support for the trusted.* and
	  security.* namespaces.

	  You need this for POSIX ACL support on tmpfs.

	  If unsure, say N.

config HUGETLBFS
	bool "HugeTLB file system support"
	depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
		   SYS_SUPPORTS_HUGETLBFS || BROKEN
	help
	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
	  ramfs. For architectures that support it, say Y here and read
	  <file:Documentation/vm/hugetlbpage.txt> for details.

	  If unsure, say N.

config HUGETLB_PAGE
	def_bool HUGETLBFS

source "fs/configfs/Kconfig"
source "fs/efivarfs/Kconfig"

endmenu

menuconfig MISC_FILESYSTEMS
	bool "Miscellaneous filesystems"
	default y
	---help---
	  Say Y here to get to see options for various miscellaneous
	  filesystems, such as filesystems that came from other
	  operating systems.

	  This option alone does not add any kernel code.

	  If you say N, all options in this submenu will be skipped and
	  disabled; if unsure, say Y here.

if MISC_FILESYSTEMS

source "fs/orangefs/Kconfig"
source "fs/adfs/Kconfig"
source "fs/affs/Kconfig"
source "fs/ecryptfs/Kconfig"
source "fs/hfs/Kconfig"
source "fs/hfsplus/Kconfig"
source "fs/befs/Kconfig"
source "fs/bfs/Kconfig"
source "fs/efs/Kconfig"
source "fs/jffs2/Kconfig"
# UBIFS File system configuration
source "fs/ubifs/Kconfig"
source "fs/logfs/Kconfig"
source "fs/cramfs/Kconfig"
source "fs/squashfs/Kconfig"
source "fs/freevxfs/Kconfig"
source "fs/minix/Kconfig"
source "fs/omfs/Kconfig"
source "fs/hpfs/Kconfig"
source "fs/qnx4/Kconfig"
source "fs/qnx6/Kconfig"
source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"

endif # MISC_FILESYSTEMS

source "fs/exofs/Kconfig.ore"

menuconfig NETWORK_FILESYSTEMS
	bool "Network File Systems"
	default y
	depends on NET
	---help---
	  Say Y here to get to see options for network filesystems and
	  filesystem-related networking code, such as NFS daemon and
	  RPCSEC security modules.

	  This option alone does not add any kernel code.

	  If you say N, all options in this submenu will be skipped and
	  disabled; if unsure, say Y here.

if NETWORK_FILESYSTEMS

source "fs/nfs/Kconfig"
source "fs/nfsd/Kconfig"

config GRACE_PERIOD
	tristate

config LOCKD
	tristate
	depends on FILE_LOCKING
	select GRACE_PERIOD

config LOCKD_V4
	bool
	depends on NFSD_V3 || NFS_V3
	depends on FILE_LOCKING
	default y

config NFS_ACL_SUPPORT
	tristate
	select FS_POSIX_ACL

config NFS_COMMON
	bool
	depends on NFSD || NFS_FS || LOCKD
	default y

source "net/sunrpc/Kconfig"
source "fs/ceph/Kconfig"
source "fs/cifs/Kconfig"
source "fs/ncpfs/Kconfig"
source "fs/coda/Kconfig"
source "fs/afs/Kconfig"
source "fs/9p/Kconfig"

endif # NETWORK_FILESYSTEMS

source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"

endmenu
back to top