diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index ea0cd5ea1..225708675 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -175,6 +175,14 @@ CodeIndex exposes an opt-in `ActivitySource` named `CodeIndex`. MCP JSON-RPC fra Set `CDIDX_SLOW_QUERY_MS=` to write slow SQLite command diagnostics to stderr. Query commands also accept `--profile` for a JSON profile block and `--slow-query-ms ` for command-scoped profiling. Slow-query SQL diagnostics are single-line, length-bounded, and redact SQL string/blob/numeric literals before they reach stderr or the global tool log; the logged SQL is intended for operation/shape debugging, not value recovery. +### Resource-Boundary Contracts + +| Path | Contract | +|---|---| +| Worker protocol JSON | Isolated worker stdin frames are read through `BoundedLineReader`. The default frame cap is 32 MiB for both characters and UTF-8 bytes. When a larger `--max-file-bytes` setting needs JSON-escaping headroom, the protocol frame cap may expand up to `WorkerProtocolLineLimits.MaxExtendedLineUtf8Bytes` (384 MiB), never to `int.MaxValue`. `WorkerProtocolJsonValidator` rejects payloads over the negotiated character/UTF-8 byte cap before `JsonDocument.Parse`, parses with `DefaultMaxJsonDepth` (32), rejects more than 1,000,000 object properties, and rejects strings longer than the frame cap. | +| User regex find | `find --regex` keeps the classic .NET regex engine for lookaround/backreference compatibility, adds `RegexOptions.CultureInvariant`, adds `IgnoreCase` unless `--exact` is set, and uses `BoundedRegex.DefaultMatchTimeout` per match. Timeouts surface as `E014_REGEX_MATCH_TIMEOUT` / `regex_timeout` in CLI JSON, and human output includes the same recovery hint. `find --all` additionally applies candidate-file and line-scan caps before walking the whole index. 
| +| `MaxValue` sentinels | `int.MaxValue` may be used only as an internal sentinel, and only when the next operation clamps it before the value reaches SQL limits, allocation, traversal, payload sizing, or timeout conversion. User-influenced values must be reduced to named practical constants before multiplication, buffer sizing, protocol framing, or query expansion. | + ### Indexing pipeline ``` @@ -2466,6 +2474,14 @@ slow SQLite command diagnostic は `CDIDX_SLOW_QUERY_MS=` で stde query コマンドも JSON profile block 用の `--profile` と command-scoped profiling 用の `--slow-query-ms ` を受け付けます。 +### リソース境界契約 + +| 経路 | 契約 | +|---|---| +| worker protocol JSON | isolated worker の stdin frame は `BoundedLineReader` で読みます。既定の frame 上限は文字数・UTF-8 byte 数ともに 32 MiB です。大きな `--max-file-bytes` によって JSON escape 分の余裕が必要な場合、protocol frame 上限は `WorkerProtocolLineLimits.MaxExtendedLineUtf8Bytes`(384 MiB)まで拡張できますが、`int.MaxValue` までは拡張しません。`WorkerProtocolJsonValidator` は `JsonDocument.Parse` の前に合意済みの文字数 / UTF-8 byte 上限を超える payload を拒否し、`DefaultMaxJsonDepth`(32)で parse し、object property 1,000,000 件超と frame 上限を超える string を拒否します。 | +| user regex find | `find --regex` は lookaround / backreference 互換性のため classic .NET regex engine を維持し、`RegexOptions.CultureInvariant` を付け、`--exact` でない場合は `IgnoreCase` も付け、各 match に `BoundedRegex.DefaultMatchTimeout` を使います。timeout は CLI JSON で `E014_REGEX_MATCH_TIMEOUT` / `regex_timeout` として返り、人間向け出力にも同じ recovery hint が出ます。`find --all` は index 全体を走査する前に candidate file と line scan の上限も適用します。 | +| `MaxValue` sentinel | `int.MaxValue` は、次の操作が SQL limit、allocation、traversal、payload sizing、timeout conversion の前に clamp する場合だけ内部 sentinel として使えます。ユーザー影響値は multiplication、buffer sizing、protocol framing、query expansion の前に、名前付きの実用上限へ落としてください。 | + ### インデックスパイプライン ``` diff --git a/changelog.d/unreleased/4058.security.md b/changelog.d/unreleased/4058.security.md new file mode 100644 index 000000000..68c63b0a5 --- /dev/null +++ b/changelog.d/unreleased/4058.security.md @@ -0,0 +1,18 @@ +--- +category: security 
+issues: + - 4058 +affected: + - src/CodeIndex/WorkerProtocolJsonValidator.cs + - src/CodeIndex/BoundedLineReader.cs + - src/CodeIndex/Database/DbReader.FilesStatus.cs + - DEVELOPER_GUIDE.md +--- + +## English + +- **Worker JSON and regex resource limits are now explicit (#4058)** — worker protocol JSON is rejected before DOM parsing when it exceeds the negotiated frame cap, oversized `--max-file-bytes` protocol frames clamp to a named 384 MiB ceiling instead of `int.MaxValue`, and `find --regex` timeout behavior is covered by a pathological-input regression test. + +## 日本語 + +- **worker JSON と regex のリソース上限を明確化しました (#4058)** — worker protocol JSON は合意済み frame 上限を超える場合に DOM parse 前で拒否され、過大な `--max-file-bytes` による protocol frame は `int.MaxValue` ではなく名前付きの 384 MiB 上限へ clamp され、`find --regex` の timeout 挙動を pathological input の regression test で確認するようになりました。 diff --git a/src/CodeIndex/BoundedLineReader.cs b/src/CodeIndex/BoundedLineReader.cs index 8465a13d6..8d7ed2eb2 100644 --- a/src/CodeIndex/BoundedLineReader.cs +++ b/src/CodeIndex/BoundedLineReader.cs @@ -46,6 +46,8 @@ internal static class WorkerProtocolLineLimits // file cap after JSON escaping while still bounding line-protocol memory growth. internal const int MaxLineCharacters = 32 * 1024 * 1024; internal const int MaxLineUtf8Bytes = 32 * 1024 * 1024; + internal const int MaxExtendedLineCharacters = 384 * 1024 * 1024; + internal const int MaxExtendedLineUtf8Bytes = 384 * 1024 * 1024; private const long JsonEscapedCharacterBytes = 6; private const long ProtocolEnvelopeBytes = 1024 * 1024; @@ -54,11 +56,13 @@ internal static int ResolveForSourceFileBytes(long? 
maxFileSizeBytes) if (maxFileSizeBytes is not > 0) return MaxLineUtf8Bytes; + var largestUncappedFileBytes = (MaxExtendedLineUtf8Bytes - ProtocolEnvelopeBytes) / JsonEscapedCharacterBytes; + if (maxFileSizeBytes.Value >= largestUncappedFileBytes) + return MaxExtendedLineUtf8Bytes; + var required = checked(maxFileSizeBytes.Value * JsonEscapedCharacterBytes + ProtocolEnvelopeBytes); if (required <= MaxLineUtf8Bytes) return MaxLineUtf8Bytes; - if (required >= int.MaxValue) - return int.MaxValue; return (int)required; } diff --git a/src/CodeIndex/Database/DbReader.FilesStatus.cs b/src/CodeIndex/Database/DbReader.FilesStatus.cs index 7b811bb2f..c2600c5f6 100644 --- a/src/CodeIndex/Database/DbReader.FilesStatus.cs +++ b/src/CodeIndex/Database/DbReader.FilesStatus.cs @@ -3,11 +3,20 @@ using System.Globalization; using System.Text; using System.Text.RegularExpressions; +using System.Threading; namespace CodeIndex.Database; public partial class DbReader { + private static readonly AsyncLocal FindRegexMatchTimeoutOverride = new(); + + internal static TimeSpan? FindRegexMatchTimeoutForTesting + { + get => FindRegexMatchTimeoutOverride.Value; + set => FindRegexMatchTimeoutOverride.Value = value; + } + public FindResults FindInFiles(string query, int limit, string? lang = null, IReadOnlyList? pathPatterns = null, IReadOnlyList? excludePathPatterns = null, bool excludeTests = false, int before = 0, int after = 0, bool exact = false, int maxLineWidth = LineWidthFormatter.DefaultMaxLineWidth, int? focusLine = null, int? focusColumn = null, bool regex = false, int? maxCandidateFiles = null, int? 
maxLinesScanned = null) { if (string.IsNullOrWhiteSpace(query) || limit <= 0) @@ -356,9 +365,14 @@ private static Regex CreateFindRegexMatcher(string query, bool exact) var options = RegexOptions.CultureInvariant; if (!exact) options |= RegexOptions.IgnoreCase; - return new Regex(query, options, BoundedRegex.DefaultMatchTimeout); + return new Regex(query, options, ResolveFindRegexMatchTimeout()); } + private static TimeSpan ResolveFindRegexMatchTimeout() + => FindRegexMatchTimeoutForTesting is { } timeout && timeout > TimeSpan.Zero + ? timeout + : BoundedRegex.DefaultMatchTimeout; + private static void AddLineToFindWindow(IndexedLine indexedLine, Queue snippetWindow, Dictionary snippetLinesByNumber) { snippetWindow.Enqueue(indexedLine); diff --git a/src/CodeIndex/WorkerProtocolJsonValidator.cs b/src/CodeIndex/WorkerProtocolJsonValidator.cs index 342e70552..1be8f67a3 100644 --- a/src/CodeIndex/WorkerProtocolJsonValidator.cs +++ b/src/CodeIndex/WorkerProtocolJsonValidator.cs @@ -1,3 +1,4 @@ +using System.Text; using System.Text.Json; namespace CodeIndex; @@ -19,6 +20,12 @@ internal static bool TryValidate(string json, int maxStringCharacters, out strin var maxProperties = MaxJsonPropertiesForTesting ?? DefaultMaxJsonProperties; var effectiveMaxStringCharacters = MaxStringCharactersForTesting ?? maxStringCharacters; var propertyCount = 0; + if (IsPayloadOverLimit(json, maxStringCharacters)) + { + error = SafeDiagnosticFormatter.FormatCategoryType("worker_protocol_error", "json_payload_length_exceeded"); + return false; + } + try { using var document = JsonDocument.Parse(json, new JsonDocumentOptions { MaxDepth = maxDepth }); @@ -38,6 +45,16 @@ private static int ResolveMaxJsonDepth() return maxDepth > 0 ? 
maxDepth : DefaultMaxJsonDepth; } + private static bool IsPayloadOverLimit(string json, int maxCharactersAndUtf8Bytes) + { + if (maxCharactersAndUtf8Bytes <= 0) + return true; + if (json.Length > maxCharactersAndUtf8Bytes) + return true; + + return Encoding.UTF8.GetByteCount(json) > maxCharactersAndUtf8Bytes; + } + private static void ValidateElement( JsonElement element, int maxProperties, diff --git a/tests/CodeIndex.Tests/IndexCommandRunnerTests.cs b/tests/CodeIndex.Tests/IndexCommandRunnerTests.cs index 86b60491f..e413c268e 100644 --- a/tests/CodeIndex.Tests/IndexCommandRunnerTests.cs +++ b/tests/CodeIndex.Tests/IndexCommandRunnerTests.cs @@ -602,6 +602,17 @@ public void WorkerProtocol_RejectsExcessiveJsonDepth_Issue3908() } } + [Fact] + public void WorkerProtocol_RejectsPayloadOverUtf8FrameLimitBeforeDomParse_Issue4058() + { + var json = "{\"x\":\"あ\"}"; + + var valid = WorkerProtocolJsonValidator.TryValidate(json, json.Length, out var error); + + Assert.False(valid); + Assert.Equal("worker_protocol_error: json_payload_length_exceeded", error); + } + [Fact] public void WorkerProtocol_RejectsOversizedJsonStrings_Issue3759() { @@ -854,6 +865,15 @@ public void SymbolExtractionWorker_StartInfo_RaisesProtocolLimitForLargeFileCap_ startInfo.ArgumentList); } + [Fact] + public void WorkerProtocolLineLimits_ClampHugeFileCapToExtendedProtocolLimit_Issue4058() + { + var protocolLimit = WorkerProtocolLineLimits.ResolveForSourceFileBytes(long.MaxValue); + + Assert.Equal(WorkerProtocolLineLimits.MaxExtendedLineUtf8Bytes, protocolLimit); + Assert.True(protocolLimit < int.MaxValue); + } + [Fact] public void IsolatedWorkers_StartInfo_ShareDefaultsAndProtocolArguments_Issue3703() { diff --git a/tests/CodeIndex.Tests/QueryCommandRunnerSearchTests.cs b/tests/CodeIndex.Tests/QueryCommandRunnerSearchTests.cs index db42c80fa..51cdb4ac3 100644 --- a/tests/CodeIndex.Tests/QueryCommandRunnerSearchTests.cs +++ b/tests/CodeIndex.Tests/QueryCommandRunnerSearchTests.cs @@ -7892,6 +7892,46 
@@ public void RunFind_RegexMatcherUsesSharedTimeoutAndCultureInvariant_Issue3559() Assert.True((exact.Options & RegexOptions.CultureInvariant) != 0); } + [Fact] + public void RunFind_RegexPathologicalInputReturnsTimeoutJson_Issue4058() + { + var projectRoot = TestProjectHelper.CreateTempProject("cdidx_find_regex_timeout_4058"); + try + { + var dbPath = TestProjectHelper.CreateProjectDb(projectRoot); + TestProjectHelper.InsertIndexedFile( + dbPath, + "src/Auth.cs", + "csharp", + new string('a', 4096) + "!\n"); + + try + { + DbReader.FindRegexMatchTimeoutForTesting = TimeSpan.FromMilliseconds(1); + + var (exitCode, stdout, stderr) = CaptureConsole(() => QueryCommandRunner.RunFind( + ["^(a+)+$", "--regex", "--db", dbPath, "--path", "src/Auth.cs", "--json"], + _jsonOptions)); + + Assert.Equal(CommandExitCodes.RuntimeError, exitCode); + Assert.Equal(string.Empty, stderr); + using var document = ParseJsonOutput(stdout); + var json = document.RootElement; + Assert.Equal("error", json.GetProperty("status").GetString()); + Assert.Equal("E014_REGEX_MATCH_TIMEOUT", json.GetProperty("error_code").GetString()); + Assert.Equal("regex_timeout", json.GetProperty("category").GetString()); + } + finally + { + DbReader.FindRegexMatchTimeoutForTesting = null; + } + } + finally + { + TestProjectHelper.DeleteDirectory(projectRoot); + } + } + [Fact] public void RunFind_RegexTimeoutWritesRuntimeErrorJsonMetadata_Issue3559() {