#include "textflag.h"

// func IndexByteTwo(s []byte, b1, b2 byte) int
//
// Returns the index of the first occurrence of b1 or b2 in s, or -1.
// Uses ARM64 NEON to search for both bytes in a single pass over the data.
// Adapted from Go's internal/bytealg/indexbyte_arm64.s (single-byte version).
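//
// A minimal scalar Go sketch of the semantics (not the actual
// implementation, which is the NEON code below):
//
//	for i := 0; i < len(s); i++ {
//		if s[i] == b1 || s[i] == b2 {
//			return i
//		}
//	}
//	return -1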
TEXT ·IndexByteTwo(SB),NOSPLIT,$0-40
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	b1+24(FP), R1
	MOVBU	b2+25(FP), R7
	MOVD	$ret+32(FP), R8
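	// Keep &ret in R8 so every exit path can store the result with a
	// single MOVD R0, (R8).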

	// Core algorithm:
	// For each 32-byte chunk we calculate a 64-bit syndrome value,
	// with two bits per byte. We compare against both b1 and b2,
	// OR the results, then use the same syndrome extraction as
	// Go's IndexByte.

	CBZ	R2, fail
	MOVD	R0, R11			// save base for the final index calculation
	// Magic constant 0x40100401 allows us to identify which lane matches.
	// Each byte in the group of 4 gets a distinct bit: 1, 4, 16, 64.
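	// Example: after the VAND+VADDP sequence below, a match in byte 0
	// of a 4-byte group contributes 0x01, byte 1 contributes 0x04, and
	// so on, so byte i of the 32-byte chunk maps to syndrome bit 2*i.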
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16		// V0 = splat(b1)
	VMOV	R7, V7.B16		// V7 = splat(b2)
	// Work with aligned 32-byte chunks
	BIC	$0x1f, R0, R3
	VMOV	R5, V5.S4
	ANDS	$0x1f, R0, R9
	AND	$0x1f, R2, R10
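	// R9 = base & 31 (misalignment within the first block) and
	// R10 = len & 31; masklast adds them to find the end offset.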
	BEQ	loop

	// Input string is not 32-byte aligned. Process the first
	// aligned 32-byte block and mask off bytes before our start.
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUB	$0x20, R9, R4
	ADDS	R4, R2, R2
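	// R2 -= 32 - R9: bytes of s consumed by this first block. The NEON
	// ops below leave NZCV untouched, so the ADDS flags still drive BLS
	// further down (remaining <= 0 means the first block is also the last).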
	// Compare against both needles
	VCMEQ	V0.B16, V1.B16, V3.B16	// b1 vs first 16 bytes
	VCMEQ	V7.B16, V1.B16, V8.B16	// b2 vs first 16 bytes
	VORR	V8.B16, V3.B16, V3.B16	// combine
	VCMEQ	V0.B16, V2.B16, V4.B16	// b1 vs second 16 bytes
	VCMEQ	V7.B16, V2.B16, V9.B16	// b2 vs second 16 bytes
	VORR	V9.B16, V4.B16, V4.B16	// combine
	// Build syndrome
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
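	// The two VADDPs fold 32 mask bytes into 8: the distinct lane bits
	// (1, 4, 16, 64) sum without carrying, leaving R6 with two bits per
	// input byte, where bit 2*i is set iff byte i matched.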
	// Clear the irrelevant lower bits
	LSL	$1, R9, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	// The first block can also be the last
	BLS	masklast
	// Have we found something already?
	CBNZ	R6, tail

loop:
	VLD1.P	(R3), [V1.B16, V2.B16]
	SUBS	$0x20, R2, R2
	// Compare against both needles, OR results
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V7.B16, V1.B16, V8.B16
	VORR	V8.B16, V3.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VCMEQ	V7.B16, V2.B16, V9.B16
	VORR	V9.B16, V4.B16, V4.B16
	// If we're out of data we finish regardless of the result
	BLS	end
	// Fast check: OR both halves and check for any match
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6
	CBZ	R6, loop

end:
	// Found something or out of data: build the full syndrome
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	// Only mask for the last block
	BHS	tail

masklast:
	// Clear irrelevant upper bits
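	// (R9 + R10) & 31 is the end offset e within this block, so R4 ends
	// up as 2*(32 - e) and the shift pair clears the syndrome bits of
	// bytes past the end. Shift counts are taken mod 64, so e == 0
	// (a full block) clears nothing.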
	ADD	R9, R10, R4
	AND	$0x1f, R4, R4
	SUB	$0x20, R4, R4
	NEG	R4<<1, R4
	LSL	R4, R6, R6
	LSR	R4, R6, R6

tail:
	CBZ	R6, fail
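	// The first match at byte i set syndrome bit 2*i. RBIT+CLZ counts
	// trailing zeros, yielding 2*i, and R6>>1 recovers i. VLD1.P already
	// advanced R3 past this block, hence the SUB $0x20 below.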
	RBIT	R6, R6
	SUB	$0x20, R3, R3
	CLZ	R6, R6
	ADD	R6>>1, R3, R0
	SUB	R11, R0, R0
	MOVD	R0, (R8)
	RET

fail:
	MOVD	$-1, R0
	MOVD	R0, (R8)
	RET

// func lastIndexByteTwo(s []byte, b1, b2 byte) int
//
// Returns the index of the last occurrence of b1 or b2 in s, or -1.
// Scans backward using ARM64 NEON.
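//
// A minimal scalar Go sketch of the semantics (not the actual
// implementation, which is the NEON code below):
//
//	for i := len(s) - 1; i >= 0; i-- {
//		if s[i] == b1 || s[i] == b2 {
//			return i
//		}
//	}
//	return -1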
TEXT ·lastIndexByteTwo(SB),NOSPLIT,$0-40
	MOVD	s_base+0(FP), R0
	MOVD	s_len+8(FP), R2
	MOVBU	b1+24(FP), R1
	MOVBU	b2+25(FP), R7
	MOVD	$ret+32(FP), R8

	CBZ	R2, lfail
	MOVD	R0, R11			// save base
	ADD	R0, R2, R12		// R12 = end = base + len
	MOVD	$0x40100401, R5
	VMOV	R1, V0.B16		// V0 = splat(b1)
	VMOV	R7, V7.B16		// V7 = splat(b2)
	VMOV	R5, V5.S4

	// Align: find the aligned block containing the last byte
	SUB	$1, R12, R3
	BIC	$0x1f, R3, R3		// R3 = start of aligned block containing last byte

	// --- Process tail block ---
	VLD1	(R3), [V1.B16, V2.B16]
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V7.B16, V1.B16, V8.B16
	VORR	V8.B16, V3.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VCMEQ	V7.B16, V2.B16, V9.B16
	VORR	V9.B16, V4.B16, V4.B16
	// Build syndrome
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6

	// Mask upper bits (bytes past end of slice)
	// tail_bytes = end - R3 (1..32)
	SUB	R3, R12, R10		// R10 = tail_bytes
	MOVD	$64, R4
	SUB	R10<<1, R4, R4		// R4 = 64 - 2*tail_bytes
	LSL	R4, R6, R6
	LSR	R4, R6, R6
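	// Example: tail_bytes == 5 gives R4 == 54, so the shift pair keeps
	// only the low 10 syndrome bits (two per valid byte), while
	// tail_bytes == 32 gives R4 == 0 and keeps the whole block.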

	// Is this also the head block?
	CMP	R11, R3			// R3 - R11
	BLO	lmaskfirst		// R3 < base: head+tail in same block
	BEQ	ltailonly		// R3 == base: single aligned block

	// R3 > base: more blocks before this one
	CBNZ	R6, llast
	B	lbacksetup

ltailonly:
	// Single block, already masked upper bits
	CBNZ	R6, llast
	B	lfail

lmaskfirst:
	// Mask lower bits (bytes before start of slice)
	SUB	R3, R11, R4		// R4 = base - R3
	LSL	$1, R4, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	CBNZ	R6, llast
	B	lfail

lbacksetup:
	SUB	$0x20, R3		// step back one 32-byte block

lbackloop:
	VLD1	(R3), [V1.B16, V2.B16]
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V7.B16, V1.B16, V8.B16
	VORR	V8.B16, V3.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	VCMEQ	V7.B16, V2.B16, V9.B16
	VORR	V9.B16, V4.B16, V4.B16
	// Quick check: any match in this block?
	VORR	V4.B16, V3.B16, V6.B16
	VADDP	V6.D2, V6.D2, V6.D2
	VMOV	V6.D[0], R6

	// Is this a head block? (R3 < base)
	CMP	R11, R3
	BLO	lheadblock

	// Full block (R3 >= base)
	CBNZ	R6, lbackfound
	// More blocks?
	BEQ	lfail			// R3 == base, no more
	SUB	$0x20, R3
	B	lbackloop

lbackfound:
	// Build full syndrome
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	B	llast

lheadblock:
	// R3 < base. Build full syndrome if quick check had a match.
	CBZ	R6, lfail
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	VADDP	V4.B16, V3.B16, V6.B16
	VADDP	V6.B16, V6.B16, V6.B16
	VMOV	V6.D[0], R6
	// Mask lower bits
	SUB	R3, R11, R4		// R4 = base - R3
	LSL	$1, R4, R4
	LSR	R4, R6, R6
	LSL	R4, R6, R6
	CBZ	R6, lfail

llast:
	// Find last match: highest set bit in syndrome.
	// Syndrome has bit 2i set for matching byte i.
	// CLZ gives leading zeros; byte_offset = (63 - CLZ) / 2.
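	// Example: a match at byte 7 sets bit 14; CLZ = 49, 63 - 49 = 14,
	// and 14>>1 = 7.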
	CLZ	R6, R6
	MOVD	$63, R4
	SUB	R6, R4, R6		// R6 = 63 - CLZ = bit position
	LSR	$1, R6			// R6 = byte offset within block
	ADD	R3, R6, R0		// R0 = absolute address
	SUB	R11, R0, R0		// R0 = slice index
	MOVD	R0, (R8)
	RET

lfail:
	MOVD	$-1, R0
	MOVD	R0, (R8)
	RET