Skip to content

Commit f25ae18

Browse files
committed
feat: text & size filters [PS-105]
1 parent 9488552 commit f25ae18

File tree

5 files changed

+199
-3
lines changed

5 files changed

+199
-3
lines changed

pkg/utils/file_filter/file_filter.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ func WithSecretsFileFilter(path string, logger *zerolog.Logger) FileFilterOption
4848
return fmt.Errorf("error creating secrets filter: %w", err)
4949

5050
}
51-
filter.filterStrategies = append(filter.filterStrategies, secretsFilter)
51+
filter.filterStrategies = append(filter.filterStrategies, secretsFilter...)
5252
return nil
5353
}
5454
}

pkg/utils/file_filter/ignores_file_filter.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type IgnoresFileFilter struct {
1515
ignores *gitignore.GitIgnore
1616
}
1717

18-
func NewIgnoresFileFilterFromIgnoreFiles(path string, ignoresFiles []string, logger *zerolog.Logger) (Filterable, error) {
18+
func NewIgnoresFileFilterFromIgnoreFiles(path string, ignoresFiles []string, logger *zerolog.Logger) (*IgnoresFileFilter, error) {
1919
ff := FileFilter{
2020
path: path,
2121
logger: logger,
@@ -31,7 +31,8 @@ func NewIgnoresFileFilterFromIgnoreFiles(path string, ignoresFiles []string, log
3131
return &IgnoresFileFilter{ignores: gitignore.CompileIgnoreLines(rules...)}, nil
3232
}
3333

34-
func NewIgnoresFileFilterFromGlobs(globs []string) (Filterable, error) {
34+
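// NewIgnoresFileFilterFromGlobs builds an IgnoresFileFilter by compiling the given globs as ignore lines; use it for any other glob-like filtering.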
35+
func NewIgnoresFileFilterFromGlobs(globs []string) (*IgnoresFileFilter, error) {
3536
return &IgnoresFileFilter{ignores: gitignore.CompileIgnoreLines(globs...)}, nil
3637
}
3738

pkg/utils/file_filter/secrets_file_filter.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@ package file_filter
22

33
import "github.com/rs/zerolog"
44

5+
const (
6+
MaxFileSize = 5000000 // 5,000,000 bytes, i.e. 5 MB (not 50 MB); NOTE(review): confirm which limit is intended
7+
)
8+
59
var ignoredExtensionsGlob = []string{
610
"*.bmp", "*.dcm", "*.gif", "*.iff",
711
"*.jpg", "*.jpeg", "*.pbm", "*.pict",
@@ -99,4 +103,9 @@ func NewSecretsFileFilter(path string, logger *zerolog.Logger) ([]Filterable, er
99103
return nil, err
100104
}
101105

106+
textFileFilter := NewTextFileFilter(logger)
107+
108+
sizeFileFilter := NewFileSizeFilter(logger, MaxFileSize)
109+
110+
return []Filterable{sizeFileFilter, globFilter, ruleFilter, textFileFilter}, nil
102111
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package file_filter
2+
3+
import (
4+
"github.com/rs/zerolog"
5+
"os"
6+
)
7+
8+
type FileSizeFilter struct {
9+
maxSize int64
10+
logger *zerolog.Logger
11+
}
12+
13+
func NewFileSizeFilter(logger *zerolog.Logger, maxSize int64) *FileSizeFilter {
14+
return &FileSizeFilter{
15+
maxSize: maxSize,
16+
logger: logger,
17+
}
18+
}
19+
20+
func (f *FileSizeFilter) Filter(path string) bool {
21+
// Get file size
22+
info, statErr := os.Stat(path)
23+
if statErr != nil {
24+
// Filters are enforced, we should exclude any files that we can't classify
25+
f.logger.Error().Msgf("failed to get file stats: %v", statErr)
26+
return true
27+
}
28+
size := info.Size()
29+
if size == 0 || size > f.maxSize {
30+
return true
31+
}
32+
return false
33+
}
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
package file_filter
2+
3+
import (
4+
"bytes"
5+
"errors"
6+
"fmt"
7+
"github.com/rs/zerolog"
8+
"io"
9+
"os"
10+
)
11+
12+
const (
13+
// _MinNullsForUTF16Heuristic is the minimum number of null bytes the sample must contain
14+
// before the even/odd null pattern is trusted. A single stray null byte isn't a pattern.
15+
_MinNullsForUTF16Heuristic = 4
16+
// _FileHeaderSampleSize is the maximum number of bytes read from the start of a file
17+
// in order to determine if it's text or binary.
18+
_FileHeaderSampleSize = 512
19+
// _UTF16PatternThreshold is how strong the pattern must be (e.g., 0.9 = 90%).
20+
// More than this share of the nulls must be on *either* even or odd indices to be considered UTF-16.
21+
_UTF16PatternThreshold = 0.90
22+
)
23+
24+
// BOM (Byte Order Mark) definitions for UTF-16 little-endian and big-endian text.
25+
var (
26+
bomUTF16LE = []byte{0xFF, 0xFE}
27+
bomUTF16BE = []byte{0xFE, 0xFF}
28+
)
29+
30+
type TextFileFilter struct {
31+
logger *zerolog.Logger
32+
}
33+
34+
func NewTextFileFilter(logger *zerolog.Logger) *TextFileFilter {
35+
return &TextFileFilter{logger: logger}
36+
}
37+
38+
func (f *TextFileFilter) Filter(path string) bool {
39+
// Attempt to read the file header
40+
header, err := readFileHeader(path, _FileHeaderSampleSize)
41+
if err != nil {
42+
// Filters are enforced, we should exclude any files that we can't classify because of missing file header
43+
f.logger.Error().Msgf("failed to read file header stats: %v", err)
44+
return true
45+
}
46+
return !isTextContent(header)
47+
}
48+
49+
// isTextContent determines if the data slice contains text content
50+
// based on the null byte method. See: https://docs.google.com/document/d/1GYir_j0ITTxg_CqyAw8BeUZYCCUyNMAePbGw5nsTGYE/
51+
func isTextContent(data []byte) bool {
52+
// Empty files are considered text
53+
if len(data) == 0 {
54+
return true
55+
}
56+
// Fast path for common text files (no nulls)
57+
if bytes.IndexByte(data, 0x00) == -1 {
58+
return true
59+
}
60+
61+
// Nulls ARE present from this point on
62+
// Check for definitive text BOMs
63+
if isText, _ := checkBOM(data); isText {
64+
return true
65+
}
66+
67+
// No BOM. Run the null-pattern heuristic to check for UTF-16
68+
// If the check fails, it's binary (sparse or random nulls)
69+
isText, _ := checkUTF16Heuristic(data)
70+
return isText
71+
}
72+
73+
// Helper functions
74+
// checkBOM looks for known Unicode Byte Order Marks that signify text.
75+
func checkBOM(header []byte) (isText bool, reason string) {
76+
if bytes.HasPrefix(header, bomUTF16LE) {
77+
return true, "utf-16-le-bom"
78+
}
79+
if bytes.HasPrefix(header, bomUTF16BE) {
80+
return true, "utf-16-be-bom"
81+
}
82+
return false, ""
83+
}
84+
85+
// checkUTF16Heuristic analyzes the *pattern* of null bytes to guess if it's UTF-16
86+
// It returns (isText, reason). If isText is false, the reason explains why it's
87+
// classified as binary (e.g., "has-null-random").
88+
func checkUTF16Heuristic(header []byte) (isText bool, reason string) {
89+
var oddNulls, evenNulls, totalNulls int
90+
91+
// Count nulls at even vs. odd indices
92+
for i, b := range header {
93+
if b != 0x00 {
94+
continue
95+
}
96+
97+
totalNulls++
98+
if i%2 == 0 {
99+
evenNulls++
100+
} else {
101+
oddNulls++
102+
}
103+
}
104+
// Check if we have enough data to make a guess
105+
if totalNulls < _MinNullsForUTF16Heuristic {
106+
// Not enough nulls for a pattern. Safer to assume binary
107+
return false, "has-null-sparse"
108+
}
109+
110+
// Calculate the pattern strength
111+
evenShare := float64(evenNulls) / float64(totalNulls)
112+
oddShare := float64(oddNulls) / float64(totalNulls)
113+
114+
// Check if the pattern is strong enough
115+
if evenShare > _UTF16PatternThreshold || oddShare > _UTF16PatternThreshold {
116+
// >90% of nulls are on one side. This is a strong UTF-16 signal
117+
return true, "utf-16-heuristic"
118+
}
119+
// Default: Nulls are present but scattered randomly
120+
return false, "has-null-random"
121+
}
122+
123+
// readFileHeader returns up to the first n bytes of the file at path.
//
// The number of bytes requested is the smaller of n and the size reported by
// Stat. When that is zero or negative (empty file, or n <= 0) an empty,
// non-nil slice is returned without reading. The returned slice holds only
// the bytes actually read, which can be fewer than requested if the file
// shrinks between the Stat and the read.
//
// An error is returned when the file cannot be opened, stat'ed, or read;
// reaching end of file early is not an error.
func readFileHeader(path string, n int64) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("failed to open file: %w", err)
	}
	defer f.Close()

	stat, err := f.Stat()
	if err != nil {
		return nil, fmt.Errorf("failed to get file stats: %w", err)
	}

	// Read no more than the file holds. The <= 0 guard also covers a negative
	// n, which would otherwise make the allocation below panic.
	bytesToRead := min(n, stat.Size())
	if bytesToRead <= 0 {
		return []byte{}, nil
	}

	buf := make([]byte, bytesToRead)
	// A single Read may return fewer bytes than requested without hitting
	// EOF; io.ReadFull keeps reading until buf is full or the file ends, so
	// the caller always classifies the full sample.
	nr, err := io.ReadFull(f, buf)
	// io.EOF / io.ErrUnexpectedEOF only mean the file ended early (it shrank
	// after Stat); the bytes read so far are still a valid header.
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return nil, fmt.Errorf("failed to read bytes: %w", err)
	}
	return buf[:nr], nil
}

0 commit comments

Comments
 (0)