Use LSD radix sort for Result sorting in matcher

Replace comparison-based pdqsort with LSD radix sort on the uint64 sort key. Radix sort is O(n) vs O(n log n) and avoids pointer-chasing cache misses in the comparison function. Sort scratch buffer is reused across iterations to reduce GC pressure. Benchmark (single-threaded, Chromium file list): - linux query (180K matches): ~16% faster - src query (high match count): ~31% faster - Rare matches: equivalent (falls back to pdqsort for n < 128)
Add direct algo fast path in matchChunk
2026-04-14 19:59:44 +08:00 · 2026-03-01 11:58:16 +09:00 · 2026-03-01 11:58:16 +09:00 · 2026-03-01 11:18:43 +09:00
7 changed files with 219 additions and 28 deletions
--- a/src/core.go
+++ b/src/core.go
@@ -260,7 +260,7 @@ func Run(opts *Options) (int, error) {
 							return false
 						}
 						mutex.Lock()
-						if result, _, _ := pattern.MatchItem(&item, false, slab); result != nil {
+						if result, _, _ := pattern.MatchItem(&item, false, slab); result.item != nil {
 							opts.Printer(transformer(&item))
 							found = true
 						}
--- a/src/matcher.go
+++ b/src/matcher.go
@@ -3,7 +3,6 @@ package fzf
 import (
 	"fmt"
 	"runtime"
-	"sort"
 	"sync"
 	"time"

@@ -43,6 +42,7 @@ type Matcher struct {
 	reqBox         *util.EventBox
 	partitions     int
 	slab           []*util.Slab
+	sortBuf        [][]Result
 	mergerCache    map[string]MatchResult
 	revision       revision
 }
@@ -68,6 +68,7 @@ func NewMatcher(cache *ChunkCache, patternBuilder func([]rune) *Pattern,
 		reqBox:         util.NewEventBox(),
 		partitions:     partitions,
 		slab:           make([]*util.Slab, partitions),
+		sortBuf:        make([][]Result, partitions),
 		mergerCache:    make(map[string]MatchResult),
 		revision:       revision}
 }
@@ -215,11 +216,7 @@ func (m *Matcher) scan(request MatchRequest) MatchResult {
 				sliceMatches = append(sliceMatches, matches...)
 			}
 			if m.sort && request.pattern.sortable {
-				if m.tac {
-					sort.Sort(ByRelevanceTac(sliceMatches))
-				} else {
-					sort.Sort(ByRelevance(sliceMatches))
-				}
+				m.sortBuf[idx] = radixSortResults(sliceMatches, m.tac, m.sortBuf[idx])
 			}
 			resultChan <- partialResult{idx, sliceMatches}
 		}(idx, m.slab[idx], chunks)
--- a/src/pattern.go
+++ b/src/pattern.go
@@ -65,6 +65,8 @@ type Pattern struct {
 	cache         *ChunkCache
 	denylist      map[int32]struct{}
 	startIndex    int32
+	directAlgo    algo.Algo
+	directTerm    *term
 }

 var _splitRegex *regexp.Regexp
@@ -151,6 +153,7 @@ func BuildPattern(cache *ChunkCache, patternCache map[string]*Pattern, fuzzy boo
 		procFun:       make(map[termType]algo.Algo)}

 	ptr.cacheKey = ptr.buildCacheKey()
+	ptr.directAlgo, ptr.directTerm = ptr.buildDirectAlgo(fuzzyAlgo)
 	ptr.procFun[termFuzzy] = fuzzyAlgo
 	ptr.procFun[termEqual] = algo.EqualMatch
 	ptr.procFun[termExact] = algo.ExactMatchNaive
@@ -274,6 +277,22 @@ func (p *Pattern) buildCacheKey() string {
 	return strings.Join(cacheableTerms, "\t")
 }

+// buildDirectAlgo returns the algo function and term for the direct fast path
+// in matchChunk. Returns (nil, nil) if the pattern is not suitable.
+// Requirements: extended mode, single term set with single non-inverse fuzzy term, no nth.
+func (p *Pattern) buildDirectAlgo(fuzzyAlgo algo.Algo) (algo.Algo, *term) {
+	if !p.extended || len(p.nth) > 0 {
+		return nil, nil
+	}
+	if len(p.termSets) == 1 && len(p.termSets[0]) == 1 {
+		t := &p.termSets[0][0]
+		if !t.inv && t.typ == termFuzzy {
+			return fuzzyAlgo, t
+		}
+	}
+	return nil, nil
+}
+
 // CacheKey is used to build string to be used as the key of result cache
 func (p *Pattern) CacheKey() string {
 	return p.cacheKey
@@ -312,18 +331,47 @@ func (p *Pattern) matchChunk(chunk *Chunk, space []Result, slab *util.Slab) []Re
 		}
 	}

-	if len(p.denylist) == 0 {
-		// Huge code duplication for minimizing unnecessary map lookups
+	// Fast path: single fuzzy term, no nth, no denylist.
+	// Calls the algo function directly, bypassing MatchItem/extendedMatch/iter
+	// and avoiding per-match []Offset heap allocation.
+	if p.directAlgo != nil && len(p.denylist) == 0 {
+		t := p.directTerm
 		if space == nil {
 			for idx := startIdx; idx < chunk.count; idx++ {
-				if match, _, _ := p.MatchItem(&chunk.items[idx], p.withPos, slab); match != nil {
-					matches = append(matches, *match)
+				res, _ := p.directAlgo(t.caseSensitive, t.normalize, p.forward,
+					&chunk.items[idx].text, t.text, p.withPos, slab)
+				if res.Start >= 0 {
+					matches = append(matches, buildResultFromBounds(
+						&chunk.items[idx], res.Score,
+						int(res.Start), int(res.End), int(res.End), true))
 				}
 			}
 		} else {
 			for _, result := range space {
-				if match, _, _ := p.MatchItem(result.item, p.withPos, slab); match != nil {
-					matches = append(matches, *match)
+				res, _ := p.directAlgo(t.caseSensitive, t.normalize, p.forward,
+					&result.item.text, t.text, p.withPos, slab)
+				if res.Start >= 0 {
+					matches = append(matches, buildResultFromBounds(
+						result.item, res.Score,
+						int(res.Start), int(res.End), int(res.End), true))
+				}
+			}
+		}
+		return matches
+	}
+
+	if len(p.denylist) == 0 {
+		// Huge code duplication for minimizing unnecessary map lookups
+		if space == nil {
+			for idx := startIdx; idx < chunk.count; idx++ {
+				if match, _, _ := p.MatchItem(&chunk.items[idx], p.withPos, slab); match.item != nil {
+					matches = append(matches, match)
+				}
+			}
+		} else {
+			for _, result := range space {
+				if match, _, _ := p.MatchItem(result.item, p.withPos, slab); match.item != nil {
+					matches = append(matches, match)
 				}
 			}
 		}
@@ -335,8 +383,8 @@ func (p *Pattern) matchChunk(chunk *Chunk, space []Result, slab *util.Slab) []Re
 			if _, prs := p.denylist[chunk.items[idx].Index()]; prs {
 				continue
 			}
-			if match, _, _ := p.MatchItem(&chunk.items[idx], p.withPos, slab); match != nil {
-				matches = append(matches, *match)
+			if match, _, _ := p.MatchItem(&chunk.items[idx], p.withPos, slab); match.item != nil {
+				matches = append(matches, match)
 			}
 		}
 	} else {
@@ -344,30 +392,29 @@ func (p *Pattern) matchChunk(chunk *Chunk, space []Result, slab *util.Slab) []Re
 			if _, prs := p.denylist[result.item.Index()]; prs {
 				continue
 			}
-			if match, _, _ := p.MatchItem(result.item, p.withPos, slab); match != nil {
-				matches = append(matches, *match)
+			if match, _, _ := p.MatchItem(result.item, p.withPos, slab); match.item != nil {
+				matches = append(matches, match)
 			}
 		}
 	}
 	return matches
 }

-// MatchItem returns true if the Item is a match
-func (p *Pattern) MatchItem(item *Item, withPos bool, slab *util.Slab) (*Result, []Offset, *[]int) {
+// MatchItem returns the match result if the Item is a match.
+// A zero-value Result (with item == nil) indicates no match.
+func (p *Pattern) MatchItem(item *Item, withPos bool, slab *util.Slab) (Result, []Offset, *[]int) {
 	if p.extended {
 		if offsets, bonus, pos := p.extendedMatch(item, withPos, slab); len(offsets) == len(p.termSets) {
-			result := buildResult(item, offsets, bonus)
-			return &result, offsets, pos
+			return buildResult(item, offsets, bonus), offsets, pos
 		}
-		return nil, nil, nil
+		return Result{}, nil, nil
 	}
 	offset, bonus, pos := p.basicMatch(item, withPos, slab)
 	if sidx := offset[0]; sidx >= 0 {
 		offsets := []Offset{offset}
-		result := buildResult(item, offsets, bonus)
-		return &result, offsets, pos
+		return buildResult(item, offsets, bonus), offsets, pos
 	}
-	return nil, nil, nil
+	return Result{}, nil, nil
 }

 func (p *Pattern) basicMatch(item *Item, withPos bool, slab *util.Slab) (Offset, int, *[]int) {
--- a/src/result.go
+++ b/src/result.go
@@ -33,8 +33,6 @@ func buildResult(item *Item, offsets []Offset, score int) Result {
 		sort.Sort(ByOrder(offsets))
 	}

-	result := Result{item: item}
-	numChars := item.text.Length()
 	minBegin := math.MaxUint16
 	minEnd := math.MaxUint16
 	maxEnd := 0
@@ -49,6 +47,14 @@ func buildResult(item *Item, offsets []Offset, score int) Result {
 		}
 	}

+	return buildResultFromBounds(item, score, minBegin, minEnd, maxEnd, validOffsetFound)
+}
+
+// buildResultFromBounds builds a Result from pre-computed offset bounds.
+func buildResultFromBounds(item *Item, score int, minBegin, minEnd, maxEnd int, validOffsetFound bool) Result {
+	result := Result{item: item}
+	numChars := item.text.Length()
+
 	for idx, criterion := range sortCriteria {
 		val := uint16(math.MaxUint16)
 		switch criterion {
@@ -75,7 +81,6 @@ func buildResult(item *Item, offsets []Offset, score int) Result {
 			val = item.TrimLength()
 		case byPathname:
 			if validOffsetFound {
-				// lastDelim := strings.LastIndexByte(item.text.ToString(), '/')
 				lastDelim := -1
 				s := item.text.ToString()
 				for i := len(s) - 1; i >= 0; i-- {
@@ -334,3 +339,79 @@ func (a ByRelevanceTac) Swap(i, j int) {
 func (a ByRelevanceTac) Less(i, j int) bool {
 	return compareRanks(a[i], a[j], true)
 }
+
+// radixSortResults sorts Results by their points key using LSD radix sort.
+// O(n) time complexity vs O(n log n) for comparison sort.
+// The sort is stable, so equal-key items maintain original (item-index) order.
+// For tac mode, runs of equal keys are reversed after sorting.
+func radixSortResults(a []Result, tac bool, scratch []Result) []Result {
+	n := len(a)
+	if n < 128 {
+		if tac {
+			sort.Sort(ByRelevanceTac(a))
+		} else {
+			sort.Sort(ByRelevance(a))
+		}
+		return scratch[:0]
+	}
+
+	if cap(scratch) < n {
+		scratch = make([]Result, n)
+	}
+	buf := scratch[:n]
+	src, dst := a, buf
+	scattered := 0
+
+	for pass := range 8 {
+		shift := uint(pass) * 8
+
+		var count [256]int
+		for i := range src {
+			count[byte(sortKey(&src[i])>>shift)]++
+		}
+
+		// Skip if all items have the same byte value at this position
+		if count[byte(sortKey(&src[0])>>shift)] == n {
+			continue
+		}
+
+		var offset [256]int
+		for i := 1; i < 256; i++ {
+			offset[i] = offset[i-1] + count[i-1]
+		}
+
+		for i := range src {
+			b := byte(sortKey(&src[i]) >> shift)
+			dst[offset[b]] = src[i]
+			offset[b]++
+		}
+
+		src, dst = dst, src
+		scattered++
+	}
+
+	// If odd number of scatters, data is in buf, copy back to a
+	if scattered%2 == 1 {
+		copy(a, src)
+	}
+
+	// Handle tac: reverse runs of equal keys so equal-key items
+	// are in reverse item-index order
+	if tac {
+		i := 0
+		for i < n {
+			ki := sortKey(&a[i])
+			j := i + 1
+			for j < n && sortKey(&a[j]) == ki {
+				j++
+			}
+			if j-i > 1 {
+				for l, r := i, j-1; l < r; l, r = l+1, r-1 {
+					a[l], a[r] = a[r], a[l]
+				}
+			}
+			i = j
+		}
+	}
+	return scratch
+}
--- a/src/result_others.go
+++ b/src/result_others.go
@@ -14,3 +14,7 @@ func compareRanks(irank Result, jrank Result, tac bool) bool {
 	}
 	return (irank.item.Index() <= jrank.item.Index()) != tac
 }
+
+func sortKey(r *Result) uint64 {
+	return uint64(r.points[0]) | uint64(r.points[1])<<16 | uint64(r.points[2])<<32 | uint64(r.points[3])<<48
+}
--- a/src/result_test.go
+++ b/src/result_test.go
@@ -2,6 +2,7 @@ package fzf

 import (
 	"math"
+	"math/rand"
 	"sort"
 	"testing"

@@ -182,3 +183,60 @@ func TestColorOffset(t *testing.T) {
 		assert(11, 39, 40, tui.NewColorPair(4, 8, tui.Bold))
 	}
 }
+
+func TestRadixSortResults(t *testing.T) {
+	sortCriteria = []criterion{byScore, byLength}
+
+	rng := rand.New(rand.NewSource(42))
+
+	for _, n := range []int{128, 256, 500, 1000} {
+		for _, tac := range []bool{false, true} {
+			// Build items with random points and indices
+			items := make([]*Item, n)
+			for i := range items {
+				items[i] = &Item{text: util.Chars{Index: int32(i)}}
+			}
+
+			results := make([]Result, n)
+			for i := range results {
+				results[i] = Result{
+					item: items[i],
+					points: [4]uint16{
+						uint16(rng.Intn(256)),
+						uint16(rng.Intn(256)),
+						uint16(rng.Intn(256)),
+						uint16(rng.Intn(256)),
+					},
+				}
+			}
+
+			// Make some duplicates to test stability
+			for i := 0; i < n/4; i++ {
+				j := rng.Intn(n)
+				k := rng.Intn(n)
+				results[j].points = results[k].points
+			}
+
+			// Copy for reference sort
+			expected := make([]Result, n)
+			copy(expected, results)
+			if tac {
+				sort.Sort(ByRelevanceTac(expected))
+			} else {
+				sort.Sort(ByRelevance(expected))
+			}
+
+			// Radix sort
+			var scratch []Result
+			scratch = radixSortResults(results, tac, scratch)
+
+			for i := range results {
+				if results[i] != expected[i] {
+					t.Errorf("n=%d tac=%v: mismatch at index %d: got item %d, want item %d",
+						n, tac, i, results[i].item.Index(), expected[i].item.Index())
+					break
+				}
+			}
+		}
+	}
+}
--- a/src/result_x86.go
+++ b/src/result_x86.go
@@ -14,3 +14,7 @@ func compareRanks(irank Result, jrank Result, tac bool) bool {
 	}
 	return (irank.item.Index() <= jrank.item.Index()) != tac
 }
+
+func sortKey(r *Result) uint64 {
+	return *(*uint64)(unsafe.Pointer(&r.points[0]))
+}