From 493b19426ac917808bbc254326cbb896700bddcd Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 3 Jun 2026 03:53:44 +0000 Subject: [PATCH 1/3] feat(grep): real PCRE -P via fancy-regex and GNU long-option aliases -P now routes to the backtracking fancy_regex engine (bounded by FANCY_BACKTRACK_LIMIT) instead of aliasing ERE, so lookaround and backreferences work like GNU grep -P. Recursive -P bypasses the indexed search fast-path so a backend regex that can't speak PCRE cannot drop real matches. A shared Matcher enum in search_common.rs hides the two engines' differing APIs. Adds GNU long-option aliases for every supported short flag (--ignore-case, --invert-match, --max-count=N, --regexp=PAT, etc.), accepting both --name=value and --name value forms, plus -G/--basic-regexp. Also fixes -b with -o to report the match's byte offset rather than the line start. --- crates/bashkit/src/builtins/grep.rs | 317 +++++++++++++++++-- crates/bashkit/src/builtins/search_common.rs | 59 ++++ specs/implementation-status.md | 17 +- 3 files changed, 356 insertions(+), 37 deletions(-) diff --git a/crates/bashkit/src/builtins/grep.rs b/crates/bashkit/src/builtins/grep.rs index ba88b0988..b8e3c52f0 100644 --- a/crates/bashkit/src/builtins/grep.rs +++ b/crates/bashkit/src/builtins/grep.rs @@ -37,9 +37,10 @@ //! grep --line-buffered pattern # line-buffered (no-op) use async_trait::async_trait; -use regex::Regex; -use super::search_common::{build_regex_opts, parse_numeric_flag_arg}; +use super::search_common::{ + Matcher, build_fancy_matcher, build_regex_opts, parse_numeric_flag_arg, +}; use super::{Builtin, Context}; use crate::error::{Error, Result}; use crate::interpreter::ExecResult; @@ -57,6 +58,7 @@ struct GrepOptions { files_with_matches: bool, fixed_strings: bool, extended_regex: bool, + perl_regex: bool, // -P: Perl-compatible (PCRE) via fancy-regex only_matching: bool, word_regex: bool, quiet: bool, @@ -91,6 +93,7 @@ impl GrepOptions { files_with_matches: false, fixed_strings: false, extended_regex: false, + perl_regex: false, only_matching: false, word_regex: false, quiet: false, @@ -134,7 +137,8 @@ impl GrepOptions { 'w' => opts.word_regex = true, 'F' => opts.fixed_strings = true, 'E' => opts.extended_regex = true, - 'P' => opts.extended_regex = true, // Perl regex implies ERE + 'G' => opts.extended_regex = false, // basic regex (BRE, the default) + 'P' => opts.perl_regex = true, // Perl-compatible regex (PCRE) 'q' => opts.quiet = true, 'x' => opts.whole_line = true, 'H' => opts.show_filename = true, @@ -205,29 +209,100 @@ impl GrepOptions { j += 1; } } else if let Some(opt) = arg.strip_prefix("--") { - // Long options + // Long options. GNU getopt_long accepts both `--name=value` and + // `--name value`; split the inline form here and fall back to + // the next argv entry for value-taking options. if opt.is_empty() { // End of options positional.extend(args[i + 1..].iter().cloned()); break; - } else if opt == "color" || opt.starts_with("color=") { - // --color / --color=always/never/auto - no-op (we don't output ANSI) - } else if opt == "line-buffered" { - // --line-buffered - no-op (output is already line-oriented) - } else if let Some(pat) = opt.strip_prefix("include=") { - opts.include_patterns.push(strip_quotes(pat)); - } else if let Some(pat) = opt.strip_prefix("exclude=") { - opts.exclude_patterns.push(strip_quotes(pat)); - } else if let Some(pat) = opt.strip_prefix("exclude-dir=") { - opts.exclude_dir_patterns.push(strip_quotes(pat)); - } else if opt == "files-without-match" { - opts.files_without_matches = true; - } else if opt == "no-messages" { - opts.suppress_errors = true; - } else if opt == "null" { - opts.null_filename = true; } - // Ignore other unknown long options + let (name, inline_val) = match opt.split_once('=') { + Some((n, v)) => (n, Some(v.to_string())), + None => (opt, None), + }; + match name { + // Boolean flags — long-form aliases of the short flags. + "ignore-case" => opts.ignore_case = true, + "no-ignore-case" => opts.ignore_case = false, + "invert-match" => opts.invert_match = true, + "line-number" => opts.line_numbers = true, + "count" => opts.count_only = true, + "files-with-matches" => opts.files_with_matches = true, + "files-without-match" => opts.files_without_matches = true, + "only-matching" => opts.only_matching = true, + "word-regexp" => opts.word_regex = true, + "line-regexp" => opts.whole_line = true, + "fixed-strings" => opts.fixed_strings = true, + "extended-regexp" => opts.extended_regex = true, + "basic-regexp" => opts.extended_regex = false, + "perl-regexp" => opts.perl_regex = true, + "quiet" | "silent" => opts.quiet = true, + "byte-offset" => opts.byte_offset = true, + "text" => opts.binary_as_text = true, + "null-data" => opts.null_terminated = true, + "recursive" => opts.recursive = true, + "no-messages" => opts.suppress_errors = true, + "with-filename" => opts.show_filename = true, + "no-filename" => opts.no_filename = true, + "null" => opts.null_filename = true, + // No-ops: output is already line-oriented and uncolored. + "color" | "colour" | "line-buffered" => {} + // Value-taking options. + "regexp" => { + opts.patterns + .push(long_opt_value(&inline_val, name, &mut i, args)?) + } + "file" => { + opts.pattern_file = Some(long_opt_value(&inline_val, name, &mut i, args)?) + } + "max-count" => { + opts.max_count = Some(parse_long_numeric( + &long_opt_value(&inline_val, name, &mut i, args)?, + name, + )?) + } + "after-context" => { + opts.after_context = parse_long_numeric( + &long_opt_value(&inline_val, name, &mut i, args)?, + name, + )? + } + "before-context" => { + opts.before_context = parse_long_numeric( + &long_opt_value(&inline_val, name, &mut i, args)?, + name, + )? + } + "context" => { + let ctx = parse_long_numeric( + &long_opt_value(&inline_val, name, &mut i, args)?, + name, + )?; + opts.before_context = ctx; + opts.after_context = ctx; + } + "include" => opts.include_patterns.push(strip_quotes(&long_opt_value( + &inline_val, + name, + &mut i, + args, + )?)), + "exclude" => opts.exclude_patterns.push(strip_quotes(&long_opt_value( + &inline_val, + name, + &mut i, + args, + )?)), + "exclude-dir" => opts.exclude_dir_patterns.push(strip_quotes(&long_opt_value( + &inline_val, + name, + &mut i, + args, + )?)), + // Ignore other unknown long options. + _ => {} + } } else { positional.push(arg.clone()); } @@ -248,7 +323,7 @@ impl GrepOptions { Ok(opts) } - fn build_regex(&self) -> Result { + fn build_matcher(&self) -> Result { // Build patterns for each -e pattern let escaped_patterns: Vec = self .patterns @@ -260,6 +335,10 @@ impl GrepOptions { } let pat = if self.fixed_strings { regex::escape(p) + } else if self.perl_regex { + // PCRE mode (-P): pass through to fancy-regex unchanged so + // lookaround / backreferences are preserved. + p.clone() } else if !self.extended_regex { // BRE mode: convert to ERE for the regex crate // In BRE: ( ) are literal, \( \) are groups @@ -295,11 +374,45 @@ impl GrepOptions { combined }; - build_regex_opts(&final_pattern, self.ignore_case) - .map_err(|e| Error::Execution(format!("grep: invalid pattern: {}", e))) + // -P uses the backtracking PCRE engine; everything else uses the + // default linear-time engine. `-F` (fixed strings) takes precedence + // over `-P`, matching GNU grep. + if self.perl_regex && !self.fixed_strings { + build_fancy_matcher(&final_pattern, self.ignore_case) + .map_err(|e| Error::Execution(format!("grep: invalid pattern: {}", e))) + } else { + build_regex_opts(&final_pattern, self.ignore_case) + .map(Matcher::Standard) + .map_err(|e| Error::Execution(format!("grep: invalid pattern: {}", e))) + } } } +/// Resolve the value of a value-taking long option: prefer the inline +/// `--name=value` form, else consume the next argv entry (`--name value`). +fn long_opt_value( + inline: &Option, + name: &str, + i: &mut usize, + args: &[String], +) -> Result { + if let Some(v) = inline { + Ok(v.clone()) + } else { + *i += 1; + args.get(*i).cloned().ok_or_else(|| { + Error::Execution(format!("grep: option '--{}' requires an argument", name)) + }) + } +} + +/// Parse a non-negative integer for a numeric long option (`--max-count`, etc.). +fn parse_long_numeric(value: &str, name: &str) -> Result { + value + .parse() + .map_err(|_| Error::Execution(format!("grep: invalid --{} value: {}", name, value))) +} + /// Strip surrounding single or double quotes from a value fn strip_quotes(s: &str) -> String { if let Some(inner) = s.strip_prefix('\'').and_then(|s| s.strip_suffix('\'')) { @@ -384,7 +497,7 @@ impl Builtin for Grep { async fn execute(&self, ctx: Context<'_>) -> Result { if let Some(r) = super::check_help_version( ctx.args, - "Usage: grep [OPTION]... PATTERN [FILE]...\nSearch for PATTERN in each FILE.\n\n -i\t\t\tignore case distinctions\n -v\t\t\tselect non-matching lines\n -n\t\t\tprint line number with output lines\n -c\t\t\tprint only a count of matching lines\n -l\t\t\tprint only names of files with matches\n -L\t\t\tprint only names of files without matches\n -o\t\t\tshow only the matching part of lines\n -q\t\t\tsuppress all normal output\n -w\t\t\tmatch whole words only\n -x\t\t\tmatch whole lines only\n -m NUM\t\tstop after NUM matches\n -E\t\t\textended regular expressions\n -F\t\t\tfixed string matching\n -P\t\t\tPerl-compatible regular expressions\n -e PATTERN\t\tuse PATTERN for matching\n -f FILE\t\tread patterns from FILE\n -A NUM\t\tprint NUM lines of trailing context\n -B NUM\t\tprint NUM lines of leading context\n -C NUM\t\tprint NUM lines of output context\n -H\t\t\talways print filename headers\n -h\t\t\tsuppress filename headers\n -b\t\t\tprint byte offset of matches\n -a\t\t\ttreat binary files as text\n -z\t\t\tuse NUL as line separator\n -r\t\t\trecursive search\n -s\t\t\tsuppress error messages\n -Z\t\t\tprint NUL after filenames\n --include=GLOB\tsearch only files matching GLOB\n --exclude=GLOB\tskip files matching GLOB\n --exclude-dir=GLOB\tskip directories matching GLOB\n --color=WHEN\t\tcolor output (no-op)\n --line-buffered\tline-buffered output (no-op)\n --help\t\tdisplay this help and exit\n --version\t\toutput version information and exit\n", + "Usage: grep [OPTION]... PATTERN [FILE]...\nSearch for PATTERN in each FILE.\n\n -i\t\t\tignore case distinctions\n -v\t\t\tselect non-matching lines\n -n\t\t\tprint line number with output lines\n -c\t\t\tprint only a count of matching lines\n -l\t\t\tprint only names of files with matches\n -L\t\t\tprint only names of files without matches\n -o\t\t\tshow only the matching part of lines\n -q\t\t\tsuppress all normal output\n -w\t\t\tmatch whole words only\n -x\t\t\tmatch whole lines only\n -m NUM\t\tstop after NUM matches\n -E\t\t\textended regular expressions\n -F\t\t\tfixed string matching\n -G\t\t\tbasic regular expressions (default)\n -P\t\t\tPerl-compatible regular expressions\n -e PATTERN\t\tuse PATTERN for matching\n -f FILE\t\tread patterns from FILE\n -A NUM\t\tprint NUM lines of trailing context\n -B NUM\t\tprint NUM lines of leading context\n -C NUM\t\tprint NUM lines of output context\n -H\t\t\talways print filename headers\n -h\t\t\tsuppress filename headers\n -b\t\t\tprint byte offset of matches\n -a\t\t\ttreat binary files as text\n -z\t\t\tuse NUL as line separator\n -r\t\t\trecursive search\n -s\t\t\tsuppress error messages\n -Z\t\t\tprint NUL after filenames\n --include=GLOB\tsearch only files matching GLOB\n --exclude=GLOB\tskip files matching GLOB\n --exclude-dir=GLOB\tskip directories matching GLOB\n --color=WHEN\t\tcolor output (no-op)\n --line-buffered\tline-buffered output (no-op)\n --help\t\tdisplay this help and exit\n --version\t\toutput version information and exit\n", Some("grep (bashkit) 0.1"), ) { return Ok(r); @@ -418,7 +531,7 @@ impl Builtin for Grep { return Err(Error::Execution("grep: missing pattern".to_string())); } - let regex = opts.build_regex()?; + let matcher = opts.build_matcher()?; let mut output = String::new(); let mut any_match = false; @@ -443,8 +556,14 @@ impl Builtin for Grep { } vec![(stdin_name.to_string(), stdin_content)] } else if opts.recursive { - // Try indexed search via SearchCapable if available - let search_result = try_indexed_search(&*ctx.fs, &opts, ctx.cwd).await; + // Try indexed search via SearchCapable if available. Skip it for -P: + // the backend's regex engine doesn't speak PCRE (lookaround / + // backreferences), so prefiltering there could drop real matches. + let search_result = if opts.perl_regex { + None + } else { + try_indexed_search(&*ctx.fs, &opts, ctx.cwd).await + }; if let Some(indexed_inputs) = search_result { indexed_inputs @@ -586,7 +705,7 @@ impl Builtin for Grep { if opts.only_matching && !opts.invert_match { // -o mode: count each match separately - for _ in regex.find_iter(line) { + for _ in matcher.find_ranges(line) { file_matched = true; if !opts.files_without_matches { any_match = true; @@ -615,7 +734,7 @@ impl Builtin for Grep { break; } } else { - let matches = regex.is_match(line); + let matches = matcher.is_match(line); let should_match = if opts.invert_match { !matches } else { matches }; if should_match { @@ -691,7 +810,7 @@ impl Builtin for Grep { // -o mode: output each match let mut o_matches = 0usize; for (line_num, line) in lines.iter().enumerate() { - for mat in regex.find_iter(line) { + for (start, end) in matcher.find_ranges(line) { if let Some(max) = opts.max_count && o_matches >= max { @@ -702,12 +821,12 @@ impl Builtin for Grep { output.push(fname_sep); } if opts.byte_offset { - output.push_str(&format!("{}:", byte_offsets[line_num])); + output.push_str(&format!("{}:", byte_offsets[line_num] + start)); } if opts.line_numbers { output.push_str(&format!("{}:", line_num + 1)); } - output.push_str(mat.as_str()); + output.push_str(&line[start..end]); output.push('\n'); o_matches += 1; } @@ -1685,4 +1804,136 @@ mod tests { &["regex::Error", "ParseError {"], ); } + + // -P (PCRE via fancy-regex) capability tests. + + #[tokio::test] + async fn test_grep_perl_lookahead() { + // Lookahead is unsupported by the default `regex` engine; -P must + // route to fancy-regex. Match "foo" only when followed by "bar". + let result = run_grep(&["-oP", r"foo(?=bar)"], Some("foobar\nfoobaz\nfoo")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "foo\n"); + } + + #[tokio::test] + async fn test_grep_perl_backreference() { + // Backreferences are rejected by `regex`; fancy-regex accepts them. + let result = run_grep(&["-P", r"(\w+) \1"], Some("hello hello\nhello world")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "hello hello\n"); + } + + #[tokio::test] + async fn test_grep_perl_lookbehind() { + let result = run_grep(&["-oP", r"(?<=\$)\d+"], Some("price $42 and 99")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "42\n"); + } + + #[tokio::test] + async fn test_grep_perl_long_flag() { + let result = run_grep(&["--perl-regexp", r"\d{3}"], Some("ab12\nxy345")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "xy345\n"); + } + + #[tokio::test] + async fn test_grep_perl_invalid_pattern_errors() { + // Unbalanced group: fancy-regex rejects it; we surface an error, not a + // panic, and must not leak the engine's Debug shape. + let result = run_grep(&["-P", "(foo"], Some("foo")).await; + assert!(result.is_err()); + } + + // GNU long-option alias capability tests. + + #[tokio::test] + async fn test_grep_long_ignore_case() { + let result = run_grep(&["--ignore-case", "HELLO"], Some("Hello World\ngoodbye")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "Hello World\n"); + } + + #[tokio::test] + async fn test_grep_long_invert_and_line_number() { + let result = run_grep( + &["--invert-match", "--line-number", "hello"], + Some("hello\nworld\nhello again"), + ) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "2:world\n"); + } + + #[tokio::test] + async fn test_grep_long_max_count_inline() { + let result = run_grep(&["--max-count=2", "o"], Some("foo\nbar\nboo\nzoo")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "foo\nboo\n"); + } + + #[tokio::test] + async fn test_grep_long_max_count_separate_arg() { + // Space-separated value form: `--max-count 1`. + let result = run_grep(&["--max-count", "1", "o"], Some("foo\nboo\nzoo")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "foo\n"); + } + + #[tokio::test] + async fn test_grep_long_regexp_multiple() { + let result = run_grep(&["--regexp=foo", "--regexp", "baz"], Some("foo\nbar\nbaz")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "foo\nbaz\n"); + } + + #[tokio::test] + async fn test_grep_long_fixed_strings() { + let result = run_grep(&["--fixed-strings", "a.b"], Some("a.b\naxb")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "a.b\n"); + } + + #[tokio::test] + async fn test_grep_long_word_regexp() { + let result = run_grep(&["--word-regexp", "foo"], Some("foo\nfoobar\nbar foo")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "foo\nbar foo\n"); + } + + #[tokio::test] + async fn test_grep_long_missing_value_errors() { + let result = run_grep(&["--max-count"], Some("foo")).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_grep_only_matching_byte_offset() { + // -b with -o reports the byte offset of the match, not the line start. + let result = run_grep(&["-ob", "bar"], Some("foobar\n")).await.unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "3:bar\n"); + } } diff --git a/crates/bashkit/src/builtins/search_common.rs b/crates/bashkit/src/builtins/search_common.rs index 1f7b7082e..409615c7e 100644 --- a/crates/bashkit/src/builtins/search_common.rs +++ b/crates/bashkit/src/builtins/search_common.rs @@ -13,6 +13,11 @@ pub(crate) const REGEX_SIZE_LIMIT: usize = 1_000_000; /// Default DFA size limit (1 MB). pub(crate) const REGEX_DFA_SIZE_LIMIT: usize = 1_000_000; +/// Backtracking step cap for the fancy-regex (PCRE / `grep -P`) engine. +/// Bounds worst-case backtracking so a pathological pattern can't hang the +/// sandbox (mirrors `sed.rs`'s fallback limit; see TM-INF threat model). +pub(crate) const FANCY_BACKTRACK_LIMIT: usize = 1_000_000; + /// Build a regex with enforced size limits. pub(crate) fn build_regex(pattern: &str) -> std::result::Result { build_regex_opts(pattern, false) @@ -30,6 +35,60 @@ pub(crate) fn build_regex_opts( .build() } +/// A compiled search pattern backed by either the default linear-time `regex` +/// engine or the backtracking `fancy_regex` engine (used for `grep -P`, +/// enabling lookaround and backreferences that `regex` rejects). +/// +/// The two engines expose different APIs (`fancy_regex` returns `Result` from +/// `is_match`/`find_iter`); this enum hides that behind a uniform surface so +/// callers needn't branch on the engine. Match failures from the backtracking +/// engine (e.g. hitting `FANCY_BACKTRACK_LIMIT`) are treated as "no match". +pub(crate) enum Matcher { + Standard(Regex), + Fancy(fancy_regex::Regex), +} + +impl Matcher { + /// Whether `text` contains at least one match. + pub(crate) fn is_match(&self, text: &str) -> bool { + match self { + Matcher::Standard(re) => re.is_match(text), + Matcher::Fancy(re) => re.is_match(text).unwrap_or(false), + } + } + + /// Byte ranges `(start, end)` of all non-overlapping matches, left to + /// right. Slice `text[start..end]` to recover the matched substring. + pub(crate) fn find_ranges(&self, text: &str) -> Vec<(usize, usize)> { + match self { + Matcher::Standard(re) => re.find_iter(text).map(|m| (m.start(), m.end())).collect(), + // `find_iter` yields `Result`; `flatten` drops the Err + // arm (backtrack-limit / internal errors) — same "no match" policy + // as `is_match`. + Matcher::Fancy(re) => re + .find_iter(text) + .flatten() + .map(|m| (m.start(), m.end())) + .collect(), + } + } +} + +/// Build a PCRE-style matcher (`grep -P`) with enforced size and backtracking +/// limits and optional case-insensitivity. +pub(crate) fn build_fancy_matcher( + pattern: &str, + case_insensitive: bool, +) -> std::result::Result { + fancy_regex::RegexBuilder::new(pattern) + .case_insensitive(case_insensitive) + .delegate_size_limit(REGEX_SIZE_LIMIT) + .delegate_dfa_size_limit(REGEX_DFA_SIZE_LIMIT) + .backtrack_limit(FANCY_BACKTRACK_LIMIT) + .build() + .map(Matcher::Fancy) +} + /// Parse a numeric flag argument from short-flag character stream. /// /// Handles both `-m5` (value in same arg) and `-m 5` (value in next arg) forms. diff --git a/specs/implementation-status.md b/specs/implementation-status.md index 6d1afb85d..d2c407d9e 100644 --- a/specs/implementation-status.md +++ b/specs/implementation-status.md @@ -368,18 +368,27 @@ None currently tracked. **Skipped Tests: 0** (all grep tests pass) **Implemented Features:** -- Basic flags: `-i`, `-v`, `-c`, `-n`, `-o`, `-l`, `-w`, `-E`, `-F`, `-q`, `-m`, `-x` +- Basic flags: `-i`, `-v`, `-c`, `-n`, `-o`, `-l`, `-w`, `-E`, `-F`, `-G`, `-q`, `-m`, `-x` - Context: `-A`, `-B`, `-C` (after/before/context lines) - Multiple patterns: `-e` - Include/exclude: `--include=GLOB`, `--exclude=GLOB` for recursive search - Pattern file: `-f` (requires file to exist in VFS) - Filename control: `-H` (always show), `-h` (never show) -- Byte offset: `-b` +- Byte offset: `-b` (with `-o`, reports the match's offset, not the line start) - Null-terminated: `-z` (split on `\0` instead of `\n`) - Recursive: `-r`/`-R` (uses VFS read_dir) - Binary handling: `-a` (filter null bytes), auto-detect binary (null byte → "Binary file ... matches") -- Perl regex: `-P` (regex crate supports PCRE features) -- No-op flags: `--color`, `--line-buffered` +- Perl regex: `-P` — true PCRE via `fancy_regex` (lookaround, backreferences; + bounded by `FANCY_BACKTRACK_LIMIT`). Recursive `-P` bypasses the indexed + search fast-path so backend regex semantics can't drop PCRE-only matches. +- GNU long-option aliases for all of the above, e.g. `--ignore-case`, + `--invert-match`, `--line-number`, `--count`, `--only-matching`, + `--word-regexp`, `--line-regexp`, `--fixed-strings`, `--extended-regexp`, + `--basic-regexp`, `--perl-regexp`, `--quiet`/`--silent`, `--byte-offset`, + `--text`, `--null-data`, `--recursive`, `--with-filename`, `--no-filename`, + `--regexp=PAT`, `--file=FILE`, `--max-count=N`, `--after-context=N`, + `--before-context=N`, `--context=N` (both `--name=value` and `--name value`) +- No-op flags: `--color`/`--colour`, `--line-buffered` ### JQ Limitations From 767f3f930a585b247550a7982aa52608b26aa47d Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 3 Jun 2026 04:07:38 +0000 Subject: [PATCH 2/3] test(grep): bound -P backtracking and document TM-DOS-025 Add a catastrophic-backtracking regression test for grep -P and a THREAT[TM-DOS-025] marker at the fancy-regex mitigation point. Update the threat model: the default regex engine is linear-time and the fancy-regex paths (grep -P, sed) are capped by FANCY_BACKTRACK_LIMIT, so TM-DOS-025 moves from Partial to MITIGATED. --- crates/bashkit/src/builtins/grep.rs | 11 +++++++++++ crates/bashkit/src/builtins/search_common.rs | 2 ++ specs/threat-model.md | 4 ++-- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/crates/bashkit/src/builtins/grep.rs b/crates/bashkit/src/builtins/grep.rs index b8e3c52f0..ef109fdb7 100644 --- a/crates/bashkit/src/builtins/grep.rs +++ b/crates/bashkit/src/builtins/grep.rs @@ -1936,4 +1936,15 @@ mod tests { assert_eq!(result.exit_code, 0); assert_eq!(result.stdout, "3:bar\n"); } + + #[tokio::test] + async fn test_grep_perl_catastrophic_backtrack_is_bounded() { + // TM-DOS-025: a classic catastrophic-backtracking pattern against a + // long non-matching line must terminate (backtrack-limit -> "no match") + // rather than hang the sandbox. + let haystack = format!("{}!", "a".repeat(40)); + let result = run_grep(&["-P", r"(a+)+$"], Some(&haystack)).await.unwrap(); + assert_eq!(result.exit_code, 1); + assert_eq!(result.stdout, ""); + } } diff --git a/crates/bashkit/src/builtins/search_common.rs b/crates/bashkit/src/builtins/search_common.rs index 409615c7e..6b0fcfd0a 100644 --- a/crates/bashkit/src/builtins/search_common.rs +++ b/crates/bashkit/src/builtins/search_common.rs @@ -76,6 +76,8 @@ impl Matcher { /// Build a PCRE-style matcher (`grep -P`) with enforced size and backtracking /// limits and optional case-insensitivity. +// THREAT[TM-DOS-025]: fancy-regex is a backtracking engine; the backtrack_limit +// caps worst-case work so a crafted pattern can't hang the sandbox. pub(crate) fn build_fancy_matcher( pattern: &str, case_insensitive: bool, diff --git a/specs/threat-model.md b/specs/threat-model.md index b9e094344..d648ffb6e 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -272,7 +272,7 @@ max_ast_depth: 100, // Parser recursion (TM-DOS-022) |----|--------|--------------|------------|--------| | TM-DOS-023 | Long computation | Complex awk/sed regex | Timeout (30s) | **MITIGATED** | | TM-DOS-024 | Parser hang | Malformed input | `parser_timeout` (5s) + `max_parser_operations` | **MITIGATED** | -| TM-DOS-025 | Regex backtrack | `grep "a](*b)*c" file` | Regex crate limits | Partial | +| TM-DOS-025 | Regex backtrack | `grep "a](*b)*c" file`; `grep -P '(a+)+$' file` | Default `regex` engine is linear-time (no catastrophic backtracking); `grep -P`/`sed` fancy-regex paths are capped by `FANCY_BACKTRACK_LIMIT` (1M steps) — a match that exceeds it yields "no match" rather than hanging | **MITIGATED** | | TM-DOS-027 | Builtin parser recursion | Deeply nested awk/jq expressions | `MAX_AWK_PARSER_DEPTH` (100) + `MAX_JQ_JSON_DEPTH` (100) | **MITIGATED** | | TM-DOS-028 | Diff algorithm DoS | `diff` on two large unrelated files | LCS matrix capped at 10M cells; falls back to simple line-by-line output | **MITIGATED** | | TM-DOS-029 | Arithmetic overflow/panic | `$(( 2 ** -1 ))`, `$(( 1 << 64 ))`, `i64::MIN / -1` | Arithmetic ops use `wrapping_*` / saturating semantics — `i64::MIN / -1` and unary negate go through `wrapping_neg`; `<<` / `>>` clamp the shift amount | **MITIGATED** | @@ -1389,7 +1389,7 @@ This section maps former vulnerability IDs to the new threat ID scheme and track | Threat ID | Vulnerability | Impact | Rationale | |-----------|---------------|--------|-----------| | TM-DOS-011 | Symlinks not followed | Functionality gap | By design - prevents symlink attacks | -| TM-DOS-025 | Regex backtracking | CPU exhaustion | Regex crate has internal limits | +| TM-DOS-025 | Regex backtracking | CPU exhaustion | Linear-time `regex` engine by default; fancy-regex paths (`grep -P`, `sed`) capped by `FANCY_BACKTRACK_LIMIT` | | TM-DOS-033 | AWK unbounded loops | CPU exhaustion | 30s timeout backstop | | TM-UNI-004 | Zero-width chars in variable names | Variable confusion | Matches Bash behavior | | TM-UNI-006 | Homoglyph filenames | Visual confusion | Impractical to fully detect | From 4f680c8809b42a27c4c784b4d20ff2f43acfbd4c Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 3 Jun 2026 04:16:52 +0000 Subject: [PATCH 3/3] fix(grep): make -G/-E/-F/-P mutually exclusive and document long opts in --help Address PR review: - Pattern-type flags now go through set_pattern_type(PatternType), so the last of -G/-E/-F/-P (and their long forms) wins, matching GNU grep. Previously -P set perl_regex without clearing extended/fixed, so a later -G/-E had no effect. - --help now lists the GNU long-option aliases alongside each short flag. Adds last-wins tests for both short and long forms. --- crates/bashkit/src/builtins/grep.rs | 77 +++++++++++++++++++++++++---- specs/implementation-status.md | 1 + 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/crates/bashkit/src/builtins/grep.rs b/crates/bashkit/src/builtins/grep.rs index ef109fdb7..9e153f0f4 100644 --- a/crates/bashkit/src/builtins/grep.rs +++ b/crates/bashkit/src/builtins/grep.rs @@ -48,6 +48,15 @@ use crate::interpreter::ExecResult; /// grep command - pattern matching pub struct Grep; +/// Which pattern syntax to compile with. `-G/-E/-F/-P` select one of these and +/// are mutually exclusive (GNU grep: the last one on the command line wins). +enum PatternType { + Basic, // -G: basic regular expressions (the default) + Extended, // -E: extended regular expressions + Fixed, // -F: literal fixed strings + Perl, // -P: Perl-compatible (PCRE) via fancy-regex +} + struct GrepOptions { patterns: Vec, files: Vec, @@ -135,10 +144,12 @@ impl GrepOptions { 'l' => opts.files_with_matches = true, 'o' => opts.only_matching = true, 'w' => opts.word_regex = true, - 'F' => opts.fixed_strings = true, - 'E' => opts.extended_regex = true, - 'G' => opts.extended_regex = false, // basic regex (BRE, the default) - 'P' => opts.perl_regex = true, // Perl-compatible regex (PCRE) + // Pattern type: -G/-E/-F/-P are mutually exclusive, + // last one wins (GNU grep semantics). + 'F' => opts.set_pattern_type(PatternType::Fixed), + 'E' => opts.set_pattern_type(PatternType::Extended), + 'G' => opts.set_pattern_type(PatternType::Basic), + 'P' => opts.set_pattern_type(PatternType::Perl), 'q' => opts.quiet = true, 'x' => opts.whole_line = true, 'H' => opts.show_filename = true, @@ -233,10 +244,11 @@ impl GrepOptions { "only-matching" => opts.only_matching = true, "word-regexp" => opts.word_regex = true, "line-regexp" => opts.whole_line = true, - "fixed-strings" => opts.fixed_strings = true, - "extended-regexp" => opts.extended_regex = true, - "basic-regexp" => opts.extended_regex = false, - "perl-regexp" => opts.perl_regex = true, + // Pattern type: mutually exclusive, last one wins. + "fixed-strings" => opts.set_pattern_type(PatternType::Fixed), + "extended-regexp" => opts.set_pattern_type(PatternType::Extended), + "basic-regexp" => opts.set_pattern_type(PatternType::Basic), + "perl-regexp" => opts.set_pattern_type(PatternType::Perl), "quiet" | "silent" => opts.quiet = true, "byte-offset" => opts.byte_offset = true, "text" => opts.binary_as_text = true, @@ -323,6 +335,14 @@ impl GrepOptions { Ok(opts) } + /// Select the pattern syntax, clearing any previously-selected type so the + /// last `-G/-E/-F/-P` flag wins (GNU grep semantics). + fn set_pattern_type(&mut self, pt: PatternType) { + self.fixed_strings = matches!(pt, PatternType::Fixed); + self.extended_regex = matches!(pt, PatternType::Extended); + self.perl_regex = matches!(pt, PatternType::Perl); + } + fn build_matcher(&self) -> Result { // Build patterns for each -e pattern let escaped_patterns: Vec = self @@ -497,7 +517,7 @@ impl Builtin for Grep { async fn execute(&self, ctx: Context<'_>) -> Result { if let Some(r) = super::check_help_version( ctx.args, - "Usage: grep [OPTION]... PATTERN [FILE]...\nSearch for PATTERN in each FILE.\n\n -i\t\t\tignore case distinctions\n -v\t\t\tselect non-matching lines\n -n\t\t\tprint line number with output lines\n -c\t\t\tprint only a count of matching lines\n -l\t\t\tprint only names of files with matches\n -L\t\t\tprint only names of files without matches\n -o\t\t\tshow only the matching part of lines\n -q\t\t\tsuppress all normal output\n -w\t\t\tmatch whole words only\n -x\t\t\tmatch whole lines only\n -m NUM\t\tstop after NUM matches\n -E\t\t\textended regular expressions\n -F\t\t\tfixed string matching\n -G\t\t\tbasic regular expressions (default)\n -P\t\t\tPerl-compatible regular expressions\n -e PATTERN\t\tuse PATTERN for matching\n -f FILE\t\tread patterns from FILE\n -A NUM\t\tprint NUM lines of trailing context\n -B NUM\t\tprint NUM lines of leading context\n -C NUM\t\tprint NUM lines of output context\n -H\t\t\talways print filename headers\n -h\t\t\tsuppress filename headers\n -b\t\t\tprint byte offset of matches\n -a\t\t\ttreat binary files as text\n -z\t\t\tuse NUL as line separator\n -r\t\t\trecursive search\n -s\t\t\tsuppress error messages\n -Z\t\t\tprint NUL after filenames\n --include=GLOB\tsearch only files matching GLOB\n --exclude=GLOB\tskip files matching GLOB\n --exclude-dir=GLOB\tskip directories matching GLOB\n --color=WHEN\t\tcolor output (no-op)\n --line-buffered\tline-buffered output (no-op)\n --help\t\tdisplay this help and exit\n --version\t\toutput version information and exit\n", + "Usage: grep [OPTION]... PATTERN [FILE]...\nSearch for PATTERN in each FILE.\n\n -i, --ignore-case\t\tignore case distinctions\n -v, --invert-match\t\tselect non-matching lines\n -n, --line-number\t\tprint line number with output lines\n -c, --count\t\t\tprint only a count of matching lines\n -l, --files-with-matches\tprint only names of files with matches\n -L, --files-without-match\tprint only names of files without matches\n -o, --only-matching\t\tshow only the matching part of lines\n -q, --quiet, --silent\t\tsuppress all normal output\n -w, --word-regexp\t\tmatch whole words only\n -x, --line-regexp\t\tmatch whole lines only\n -m, --max-count=NUM\t\tstop after NUM matches\n -E, --extended-regexp\t\textended regular expressions\n -F, --fixed-strings\t\tfixed string matching\n -G, --basic-regexp\t\tbasic regular expressions (default)\n -P, --perl-regexp\t\tPerl-compatible regular expressions\n -e, --regexp=PATTERN\t\tuse PATTERN for matching\n -f, --file=FILE\t\tread patterns from FILE\n -A, --after-context=NUM\tprint NUM lines of trailing context\n -B, --before-context=NUM\tprint NUM lines of leading context\n -C, --context=NUM\t\tprint NUM lines of output context\n -H, --with-filename\t\talways print filename headers\n -h, --no-filename\t\tsuppress filename headers\n -b, --byte-offset\t\tprint byte offset of matches\n -a, --text\t\t\ttreat binary files as text\n -z, --null-data\t\tuse NUL as line separator\n -r, -R, --recursive\t\trecursive search\n -s, --no-messages\t\tsuppress error messages\n -Z, --null\t\t\tprint NUL after filenames\n --include=GLOB\t\tsearch only files matching GLOB\n --exclude=GLOB\t\tskip files matching GLOB\n --exclude-dir=GLOB\t\tskip directories matching GLOB\n --color=WHEN\t\t\tcolor output (no-op)\n --line-buffered\t\tline-buffered output (no-op)\n --help\t\t\tdisplay this help and exit\n --version\t\t\toutput version information and exit\n", Some("grep (bashkit) 0.1"), ) { return Ok(r); @@ -1846,6 +1866,45 @@ mod tests { assert_eq!(result.stdout, "xy345\n"); } + #[tokio::test] + async fn test_grep_pattern_type_extended_then_perl_last_wins() { + // -E then -P: perl wins, so the backreference compiles and matches. + let result = run_grep(&["-E", "-P", r"(.)\1"], Some("aa\nab")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "aa\n"); + } + + #[tokio::test] + async fn test_grep_pattern_type_perl_then_extended_last_wins() { + // -P then -E: extended wins, so -E cleared perl_regex and the + // backreference is rejected by the linear-time engine (error). + let result = run_grep(&["-P", "-E", r"(.)\1"], Some("aa")).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_grep_pattern_type_long_options_last_wins() { + // --perl-regexp then --extended-regexp: extended wins -> backref rejected. + let result = run_grep( + &["--perl-regexp", "--extended-regexp", r"(.)\1"], + Some("aa"), + ) + .await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_grep_pattern_type_perl_then_fixed_last_wins() { + // -P then -F: fixed wins, so the pattern is matched literally. + let result = run_grep(&["-P", "-F", r"a.b"], Some("a.b\naxb")) + .await + .unwrap(); + assert_eq!(result.exit_code, 0); + assert_eq!(result.stdout, "a.b\n"); + } + #[tokio::test] async fn test_grep_perl_invalid_pattern_errors() { // Unbalanced group: fancy-regex rejects it; we surface an error, not a diff --git a/specs/implementation-status.md b/specs/implementation-status.md index d2c407d9e..435f32dff 100644 --- a/specs/implementation-status.md +++ b/specs/implementation-status.md @@ -369,6 +369,7 @@ None currently tracked. **Implemented Features:** - Basic flags: `-i`, `-v`, `-c`, `-n`, `-o`, `-l`, `-w`, `-E`, `-F`, `-G`, `-q`, `-m`, `-x` + (pattern-type flags `-G`/`-E`/`-F`/`-P` are mutually exclusive; last one wins, per GNU grep) - Context: `-A`, `-B`, `-C` (after/before/context lines) - Multiple patterns: `-e` - Include/exclude: `--include=GLOB`, `--exclude=GLOB` for recursive search