Files
fzf/src/algo/indexbyte2_arm64.s
Junegunn Choi f3ca0b1365
Some checks failed
CodeQL / Analyze (go) (push) Has been cancelled
build / build (push) Has been cancelled
Test fzf on macOS / build (push) Has been cancelled
Fix OSC8 hyperlinks mangled when URL contains unicode
Fix #4707
2026-03-08 13:55:14 +09:00

250 lines
6.2 KiB
ArmAsm

#include "textflag.h"
// func IndexByteTwo(s []byte, b1, b2 byte) int
//
// Returns the index of the first occurrence of b1 or b2 in s, or -1.
// Uses ARM64 NEON to search for both bytes in a single pass over the data.
// Adapted from Go's internal/bytealg/indexbyte_arm64.s (single-byte version).
TEXT ·IndexByteTwo(SB),NOSPLIT,$0-40
MOVD s_base+0(FP), R0
MOVD s_len+8(FP), R2
MOVBU b1+24(FP), R1
MOVBU b2+25(FP), R7
MOVD $ret+32(FP), R8
// Core algorithm:
// For each 32-byte chunk we calculate a 64-bit syndrome value,
// with two bits per byte. We compare against both b1 and b2,
// OR the results, then use the same syndrome extraction as
// Go's IndexByte.
CBZ R2, fail
MOVD R0, R11
// Magic constant 0x40100401 allows us to identify which lane matches.
// Each byte in the group of 4 gets a distinct bit: 1, 4, 16, 64.
MOVD $0x40100401, R5
VMOV R1, V0.B16 // V0 = splat(b1)
VMOV R7, V7.B16 // V7 = splat(b2)
// Work with aligned 32-byte chunks
BIC $0x1f, R0, R3
VMOV R5, V5.S4
ANDS $0x1f, R0, R9
AND $0x1f, R2, R10
BEQ loop
// Input string is not 32-byte aligned. Process the first
// aligned 32-byte block and mask off bytes before our start.
VLD1.P (R3), [V1.B16, V2.B16]
SUB $0x20, R9, R4
ADDS R4, R2, R2
// Compare against both needles
VCMEQ V0.B16, V1.B16, V3.B16 // b1 vs first 16 bytes
VCMEQ V7.B16, V1.B16, V8.B16 // b2 vs first 16 bytes
VORR V8.B16, V3.B16, V3.B16 // combine
VCMEQ V0.B16, V2.B16, V4.B16 // b1 vs second 16 bytes
VCMEQ V7.B16, V2.B16, V9.B16 // b2 vs second 16 bytes
VORR V9.B16, V4.B16, V4.B16 // combine
// Build syndrome
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16
VADDP V6.B16, V6.B16, V6.B16
VMOV V6.D[0], R6
// Clear the irrelevant lower bits
LSL $1, R9, R4
LSR R4, R6, R6
LSL R4, R6, R6
// The first block can also be the last
BLS masklast
// Have we found something already?
CBNZ R6, tail
loop:
VLD1.P (R3), [V1.B16, V2.B16]
SUBS $0x20, R2, R2
// Compare against both needles, OR results
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V7.B16, V1.B16, V8.B16
VORR V8.B16, V3.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
VCMEQ V7.B16, V2.B16, V9.B16
VORR V9.B16, V4.B16, V4.B16
// If we're out of data we finish regardless of the result
BLS end
// Fast check: OR both halves and check for any match
VORR V4.B16, V3.B16, V6.B16
VADDP V6.D2, V6.D2, V6.D2
VMOV V6.D[0], R6
CBZ R6, loop
end:
// Found something or out of data build full syndrome
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16
VADDP V6.B16, V6.B16, V6.B16
VMOV V6.D[0], R6
// Only mask for the last block
BHS tail
masklast:
// Clear irrelevant upper bits
ADD R9, R10, R4
AND $0x1f, R4, R4
SUB $0x20, R4, R4
NEG R4<<1, R4
LSL R4, R6, R6
LSR R4, R6, R6
tail:
CBZ R6, fail
RBIT R6, R6
SUB $0x20, R3, R3
CLZ R6, R6
ADD R6>>1, R3, R0
SUB R11, R0, R0
MOVD R0, (R8)
RET
fail:
MOVD $-1, R0
MOVD R0, (R8)
RET
// func lastIndexByteTwo(s []byte, b1, b2 byte) int
//
// Returns the index of the last occurrence of b1 or b2 in s, or -1.
// Scans backward using ARM64 NEON.
TEXT ·lastIndexByteTwo(SB),NOSPLIT,$0-40
MOVD s_base+0(FP), R0
MOVD s_len+8(FP), R2
MOVBU b1+24(FP), R1
MOVBU b2+25(FP), R7
MOVD $ret+32(FP), R8
CBZ R2, lfail
MOVD R0, R11 // save base
ADD R0, R2, R12 // R12 = end = base + len
MOVD $0x40100401, R5
VMOV R1, V0.B16 // V0 = splat(b1)
VMOV R7, V7.B16 // V7 = splat(b2)
VMOV R5, V5.S4
// Align: find the aligned block containing the last byte
SUB $1, R12, R3
BIC $0x1f, R3, R3 // R3 = start of aligned block containing last byte
// --- Process tail block ---
VLD1 (R3), [V1.B16, V2.B16]
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V7.B16, V1.B16, V8.B16
VORR V8.B16, V3.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
VCMEQ V7.B16, V2.B16, V9.B16
VORR V9.B16, V4.B16, V4.B16
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16
VADDP V6.B16, V6.B16, V6.B16
VMOV V6.D[0], R6
// Mask upper bits (bytes past end of slice)
// tail_bytes = end - R3 (1..32)
SUB R3, R12, R10 // R10 = tail_bytes
MOVD $64, R4
SUB R10<<1, R4, R4 // R4 = 64 - 2*tail_bytes
LSL R4, R6, R6
LSR R4, R6, R6
// Is this also the head block?
CMP R11, R3 // R3 - R11
BLO lmaskfirst // R3 < base: head+tail in same block
BEQ ltailonly // R3 == base: single aligned block
// R3 > base: more blocks before this one
CBNZ R6, llast
B lbacksetup
ltailonly:
// Single block, already masked upper bits
CBNZ R6, llast
B lfail
lmaskfirst:
// Mask lower bits (bytes before start of slice)
SUB R3, R11, R4 // R4 = base - R3
LSL $1, R4, R4
LSR R4, R6, R6
LSL R4, R6, R6
CBNZ R6, llast
B lfail
lbacksetup:
SUB $0x20, R3
lbackloop:
VLD1 (R3), [V1.B16, V2.B16]
VCMEQ V0.B16, V1.B16, V3.B16
VCMEQ V7.B16, V1.B16, V8.B16
VORR V8.B16, V3.B16, V3.B16
VCMEQ V0.B16, V2.B16, V4.B16
VCMEQ V7.B16, V2.B16, V9.B16
VORR V9.B16, V4.B16, V4.B16
// Quick check: any match in this block?
VORR V4.B16, V3.B16, V6.B16
VADDP V6.D2, V6.D2, V6.D2
VMOV V6.D[0], R6
// Is this a head block? (R3 < base)
CMP R11, R3
BLO lheadblock
// Full block (R3 >= base)
CBNZ R6, lbackfound
// More blocks?
BEQ lfail // R3 == base, no more
SUB $0x20, R3
B lbackloop
lbackfound:
// Build full syndrome
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16
VADDP V6.B16, V6.B16, V6.B16
VMOV V6.D[0], R6
B llast
lheadblock:
// R3 < base. Build full syndrome if quick check had a match.
CBZ R6, lfail
VAND V5.B16, V3.B16, V3.B16
VAND V5.B16, V4.B16, V4.B16
VADDP V4.B16, V3.B16, V6.B16
VADDP V6.B16, V6.B16, V6.B16
VMOV V6.D[0], R6
// Mask lower bits
SUB R3, R11, R4 // R4 = base - R3
LSL $1, R4, R4
LSR R4, R6, R6
LSL R4, R6, R6
CBZ R6, lfail
llast:
// Find last match: highest set bit in syndrome
// Syndrome has bit 2i set for matching byte i.
// CLZ gives leading zeros; byte_offset = (63 - CLZ) / 2.
CLZ R6, R6
MOVD $63, R4
SUB R6, R4, R6 // R6 = 63 - CLZ = bit position
LSR $1, R6 // R6 = byte offset within block
ADD R3, R6, R0 // R0 = absolute address
SUB R11, R0, R0 // R0 = slice index
MOVD R0, (R8)
RET
lfail:
MOVD $-1, R0
MOVD R0, (R8)
RET