Use a shared work queue instead of static partitioning in matcher

Replace static chunk partitioning (sliceChunks) with a shared atomic
counter that workers pull from. This gives natural load balancing;
workers that finish chunks quickly grab more work instead of idling.

With this change, NumCPU workers suffice (no need for 8x oversubscription),
reducing goroutine overhead while improving throughput by 5-22%.

Now the performance scales linearly to the number of threads:

=== query: 'linux' ===
  [all]   baseline:    17.12ms  current:    14.28ms  (1.20x)  matches: 179966 (12.79%)
  [1T]    baseline:   136.49ms  current:   137.25ms  (0.99x)  matches: 179966 (12.79%)
  [2T]    baseline:    75.74ms  current:    68.75ms  (1.10x)  matches: 179966 (12.79%)
  [4T]    baseline:    41.16ms  current:    34.97ms  (1.18x)  matches: 179966 (12.79%)
  [8T]    baseline:    32.82ms  current:    17.79ms  (1.84x)  matches: 179966 (12.79%)
This commit is contained in:
Junegunn Choi
2026-03-07 18:00:23 +09:00
parent 92dc40ea82
commit 92bfe68c74
2 changed files with 23 additions and 47 deletions

View File

@@ -34,9 +34,7 @@ const (
maxBgProcessesPerAction = 3
// Matcher
numPartitionsMultiplier = 8
maxPartitions = 32
progressMinDuration = 200 * time.Millisecond
progressMinDuration = 200 * time.Millisecond
// Capacity of each chunk
chunkSize int = 1000

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"runtime"
"sync"
"sync/atomic"
"time"
"github.com/junegunn/fzf/src/util"
@@ -57,7 +58,7 @@ const (
// NewMatcher returns a new Matcher
func NewMatcher(cache *ChunkCache, patternBuilder func([]rune) *Pattern,
sort bool, tac bool, eventBox *util.EventBox, revision revision, threads int) *Matcher {
partitions := min(numPartitionsMultiplier*runtime.NumCPU(), maxPartitions)
partitions := runtime.NumCPU()
if threads > 0 {
partitions = threads
}
@@ -148,27 +149,6 @@ func (m *Matcher) Loop() {
}
}
func (m *Matcher) sliceChunks(chunks []*Chunk) [][]*Chunk {
partitions := m.partitions
perSlice := len(chunks) / partitions
if perSlice == 0 {
partitions = len(chunks)
perSlice = 1
}
slices := make([][]*Chunk, partitions)
for i := 0; i < partitions; i++ {
start := i * perSlice
end := start + perSlice
if i == partitions-1 {
end = len(chunks)
}
slices[i] = chunks[start:end]
}
return slices
}
type partialResult struct {
index int
matches []Result
@@ -192,39 +172,37 @@ func (m *Matcher) scan(request MatchRequest) MatchResult {
maxIndex := request.chunks[numChunks-1].lastIndex(minIndex)
cancelled := util.NewAtomicBool(false)
slices := m.sliceChunks(request.chunks)
numSlices := len(slices)
resultChan := make(chan partialResult, numSlices)
numWorkers := min(m.partitions, numChunks)
var nextChunk atomic.Int32
resultChan := make(chan partialResult, numWorkers)
countChan := make(chan int, numChunks)
waitGroup := sync.WaitGroup{}
for idx, chunks := range slices {
for idx := range numWorkers {
waitGroup.Add(1)
if m.slab[idx] == nil {
m.slab[idx] = util.MakeSlab(slab16Size, slab32Size)
}
go func(idx int, slab *util.Slab, chunks []*Chunk) {
defer func() { waitGroup.Done() }()
count := 0
allMatches := make([][]Result, len(chunks))
for idx, chunk := range chunks {
matches := request.pattern.Match(chunk, slab)
allMatches[idx] = matches
count += len(matches)
go func(idx int, slab *util.Slab) {
defer waitGroup.Done()
var matches []Result
for {
ci := int(nextChunk.Add(1)) - 1
if ci >= numChunks {
break
}
chunkMatches := request.pattern.Match(request.chunks[ci], slab)
matches = append(matches, chunkMatches...)
if cancelled.Get() {
return
}
countChan <- len(matches)
}
sliceMatches := make([]Result, 0, count)
for _, matches := range allMatches {
sliceMatches = append(sliceMatches, matches...)
countChan <- len(chunkMatches)
}
if m.sort && request.pattern.sortable {
m.sortBuf[idx] = radixSortResults(sliceMatches, m.tac, m.sortBuf[idx])
m.sortBuf[idx] = radixSortResults(matches, m.tac, m.sortBuf[idx])
}
resultChan <- partialResult{idx, sliceMatches}
}(idx, m.slab[idx], chunks)
resultChan <- partialResult{idx, matches}
}(idx, m.slab[idx])
}
wait := func() bool {
@@ -252,8 +230,8 @@ func (m *Matcher) scan(request MatchRequest) MatchResult {
}
}
partialResults := make([][]Result, numSlices)
for range slices {
partialResults := make([][]Result, numWorkers)
for range numWorkers {
partialResult := <-resultChan
partialResults[partialResult.index] = partialResult.matches
}