Skip to content

Commit e20da46

Browse files
authored
Merge pull request #2437 from dgageot/fix-robots-txt
fix: cache parsed robots.txt per host instead of boolean result
2 parents 3f51652 + 350e632 commit e20da46

2 files changed

Lines changed: 86 additions & 18 deletions

File tree

pkg/tools/builtin/fetch.go

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@ func (h *fetchHandler) CallTool(ctx context.Context, params FetchToolArgs) (*too
5959

6060
var results []FetchResult
6161

62-
// Group URLs by host to fetch robots.txt once per host
63-
robotsCache := make(map[string]bool)
62+
// Cache parsed robots.txt per host
63+
robotsCache := make(map[string]*robotstxt.RobotsData)
6464

6565
for _, urlStr := range params.URLs {
6666
result := h.fetchURL(ctx, client, urlStr, params.Format, robotsCache)
@@ -91,7 +91,7 @@ type FetchResult struct {
9191
Error string `json:"error,omitempty"`
9292
}
9393

94-
func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr, format string, robotsCache map[string]bool) FetchResult {
94+
func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr, format string, robotsCache map[string]*robotstxt.RobotsData) FetchResult {
9595
result := FetchResult{URL: urlStr}
9696

9797
// Validate URL
@@ -115,13 +115,18 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
115115

116116
// Check robots.txt (with caching per host)
117117
host := parsedURL.Host
118-
allowed, cached := robotsCache[host]
118+
robots, cached := robotsCache[host]
119119
if !cached {
120-
allowed = h.checkRobotsAllowed(ctx, client, parsedURL, useragent.Header)
121-
robotsCache[host] = allowed
120+
var err error
121+
robots, err = h.fetchRobots(ctx, client, parsedURL, useragent.Header)
122+
if err != nil {
123+
result.Error = fmt.Sprintf("robots.txt check failed: %v", err)
124+
return result
125+
}
126+
robotsCache[host] = robots
122127
}
123128

124-
if !allowed {
129+
if robots != nil && !robots.TestAgent(parsedURL.Path, useragent.Header) {
125130
result.Error = "URL blocked by robots.txt"
126131
return result
127132
}
@@ -191,7 +196,10 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
191196
return result
192197
}
193198

194-
func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Client, targetURL *url.URL, userAgent string) bool {
199+
// fetchRobots fetches and parses robots.txt for the given URL's host.
200+
// Returns nil (allow all) if robots.txt is missing or unreachable.
201+
// Returns an error if the server returns a non-OK status or the content cannot be read/parsed.
202+
func (h *fetchHandler) fetchRobots(ctx context.Context, client *http.Client, targetURL *url.URL, userAgent string) (*robotstxt.RobotsData, error) {
195203
// Build robots.txt URL
196204
robotsURL := &url.URL{
197205
Scheme: targetURL.Scheme,
@@ -203,7 +211,7 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
203211
req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL.String(), http.NoBody)
204212
if err != nil {
205213
// If we can't create request, allow the fetch
206-
return true
214+
return nil, nil
207215
}
208216

209217
req.Header.Set("User-Agent", userAgent)
@@ -217,36 +225,33 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
217225
resp, err := robotsClient.Do(req)
218226
if err != nil {
219227
// If robots.txt is unreachable, allow the fetch
220-
return true
228+
return nil, nil
221229
}
222230
defer resp.Body.Close()
223231

224232
// If robots.txt doesn't exist (404), allow the fetch
225233
if resp.StatusCode == http.StatusNotFound {
226-
return true
234+
return nil, nil
227235
}
228236

229237
// For other non-200 status codes, fail the fetch
230238
if resp.StatusCode != http.StatusOK {
231-
return false
239+
return nil, fmt.Errorf("unexpected status %d", resp.StatusCode)
232240
}
233241

234242
// Read robots.txt content (limit to 64KB)
235243
robotsBody, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
236244
if err != nil {
237-
// If we can't read robots.txt, fail the fetch
238-
return false
245+
return nil, fmt.Errorf("failed to read robots.txt: %w", err)
239246
}
240247

241248
// Parse robots.txt
242249
robots, err := robotstxt.FromBytes(robotsBody)
243250
if err != nil {
244-
// If we can't parse robots.txt, fail the fetch
245-
return false
251+
return nil, fmt.Errorf("failed to parse robots.txt: %w", err)
246252
}
247253

248-
// Check if the target URL path is allowed for our user agent
249-
return robots.TestAgent(targetURL.Path, userAgent)
254+
return robots, nil
250255
}
251256

252257
func htmlToMarkdown(html string) string {

pkg/tools/builtin/fetch_test.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,69 @@ func TestFetch_RobotsMissing(t *testing.T) {
290290
assert.Contains(t, result.Output, "Content without robots.txt")
291291
}
292292

293+
func TestFetch_RobotsCachePerHost_MultipleURLs(t *testing.T) {
294+
// Regression test: robots.txt should be fetched once per host,
295+
// but each URL's path must be evaluated individually.
296+
robotsContent := "User-agent: *\nDisallow: /secret\nAllow: /"
297+
298+
robotsRequests := 0
299+
url := runHTTPServer(t, func(w http.ResponseWriter, r *http.Request) {
300+
switch r.URL.Path {
301+
case "/robots.txt":
302+
robotsRequests++
303+
w.Header().Set("Content-Type", "text/plain")
304+
fmt.Fprint(w, robotsContent)
305+
case "/public":
306+
fmt.Fprint(w, "public content")
307+
case "/secret/data":
308+
fmt.Fprint(w, "secret content")
309+
default:
310+
http.NotFound(w, r)
311+
}
312+
})
313+
314+
tool := NewFetchTool()
315+
result, err := tool.handler.CallTool(t.Context(), FetchToolArgs{
316+
URLs: []string{url + "/public", url + "/secret/data"},
317+
Format: "text",
318+
})
319+
require.NoError(t, err)
320+
321+
var results []FetchResult
322+
err = json.Unmarshal([]byte(result.Output), &results)
323+
require.NoError(t, err)
324+
require.Len(t, results, 2)
325+
326+
// First URL should succeed
327+
assert.Equal(t, 200, results[0].StatusCode)
328+
assert.Equal(t, "public content", results[0].Body)
329+
330+
// Second URL should be blocked
331+
assert.Contains(t, results[1].Error, "URL blocked by robots.txt")
332+
333+
// robots.txt should have been fetched exactly once
334+
assert.Equal(t, 1, robotsRequests, "robots.txt should be fetched once per host")
335+
}
336+
337+
func TestFetch_RobotsUnexpectedStatus(t *testing.T) {
338+
url := runHTTPServer(t, func(w http.ResponseWriter, r *http.Request) {
339+
if r.URL.Path == "/robots.txt" {
340+
w.WriteHeader(http.StatusInternalServerError)
341+
return
342+
}
343+
fmt.Fprint(w, "content")
344+
})
345+
346+
tool := NewFetchTool()
347+
result, err := tool.handler.CallTool(t.Context(), FetchToolArgs{
348+
URLs: []string{url + "/page"},
349+
Format: "text",
350+
})
351+
require.NoError(t, err)
352+
assert.Contains(t, result.Output, "robots.txt check failed")
353+
assert.Contains(t, result.Output, "unexpected status 500")
354+
}
355+
293356
func TestFetchTool_OutputSchema(t *testing.T) {
294357
tool := NewFetchTool()
295358

0 commit comments

Comments
 (0)