Merge pull request #2437 from dgageot/fix-robots-txt

dgageot · web-flow · commit e20da46d8fe7 · 2026-04-15T18:02:37.000+02:00
fix: cache parsed robots.txt per host instead of boolean result
diff --git a/pkg/tools/builtin/fetch.go b/pkg/tools/builtin/fetch.go
@@ -59,8 +59,8 @@ func (h *fetchHandler) CallTool(ctx context.Context, params FetchToolArgs) (*too
 
 	var results []FetchResult
 
-	// Group URLs by host to fetch robots.txt once per host
-	robotsCache := make(map[string]bool)
+	// Cache parsed robots.txt per host; a nil entry means "no robots.txt rules, allow all"
+	robotsCache := make(map[string]*robotstxt.RobotsData)
 
 	for _, urlStr := range params.URLs {
 		result := h.fetchURL(ctx, client, urlStr, params.Format, robotsCache)
@@ -91,7 +91,7 @@ type FetchResult struct {
 	Error         string `json:"error,omitempty"`
 }
 
-func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr, format string, robotsCache map[string]bool) FetchResult {
+func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr, format string, robotsCache map[string]*robotstxt.RobotsData) FetchResult {
 	result := FetchResult{URL: urlStr}
 
 	// Validate URL
@@ -115,13 +115,18 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
 
 	// Check robots.txt (with caching per host)
 	host := parsedURL.Host
-	allowed, cached := robotsCache[host]
+	robots, cached := robotsCache[host]
 	if !cached {
-		allowed = h.checkRobotsAllowed(ctx, client, parsedURL, useragent.Header)
-		robotsCache[host] = allowed
+		var err error
+		robots, err = h.fetchRobots(ctx, client, parsedURL, useragent.Header)
+		if err != nil {
+			result.Error = fmt.Sprintf("robots.txt check failed: %v", err)
+			return result
+		}
+		robotsCache[host] = robots
 	}
 
-	if !allowed {
+	if robots != nil && !robots.TestAgent(parsedURL.Path, useragent.Header) {
 		result.Error = "URL blocked by robots.txt"
 		return result
 	}
@@ -191,7 +196,10 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
 	return result
 }
 
-func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Client, targetURL *url.URL, userAgent string) bool {
+// fetchRobots fetches and parses robots.txt for the given URL's host.
+// Returns nil with no error (allow all) if the request cannot be built, robots.txt is unreachable, or the server answers 404.
+// Returns an error if the server returns any other non-200 status or the content cannot be read/parsed.
+func (h *fetchHandler) fetchRobots(ctx context.Context, client *http.Client, targetURL *url.URL, userAgent string) (*robotstxt.RobotsData, error) {
 	// Build robots.txt URL
 	robotsURL := &url.URL{
 		Scheme: targetURL.Scheme,
@@ -203,7 +211,7 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL.String(), http.NoBody)
 	if err != nil {
 		// If we can't create request, allow the fetch
-		return true
+		return nil, nil
 	}
 
 	req.Header.Set("User-Agent", userAgent)
@@ -217,36 +225,33 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
 	resp, err := robotsClient.Do(req)
 	if err != nil {
 		// If robots.txt is unreachable, allow the fetch
-		return true
+		return nil, nil
 	}
 	defer resp.Body.Close()
 
 	// If robots.txt doesn't exist (404), allow the fetch
 	if resp.StatusCode == http.StatusNotFound {
-		return true
+		return nil, nil
 	}
 
 	// For other non-200 status codes, fail the fetch
 	if resp.StatusCode != http.StatusOK {
-		return false
+		return nil, fmt.Errorf("unexpected status %d", resp.StatusCode)
 	}
 
 	// Read robots.txt content (limit to 64KB)
 	robotsBody, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
 	if err != nil {
-		// If we can't read robots.txt, fail the fetch
-		return false
+		return nil, fmt.Errorf("failed to read robots.txt: %w", err)
 	}
 
 	// Parse robots.txt
 	robots, err := robotstxt.FromBytes(robotsBody)
 	if err != nil {
-		// If we can't parse robots.txt, fail the fetch
-		return false
+		return nil, fmt.Errorf("failed to parse robots.txt: %w", err)
 	}
 
-	// Check if the target URL path is allowed for our user agent
-	return robots.TestAgent(targetURL.Path, userAgent)
+	return robots, nil
 }
 
 func htmlToMarkdown(html string) string {
diff --git a/pkg/tools/builtin/fetch_test.go b/pkg/tools/builtin/fetch_test.go
@@ -290,6 +290,69 @@ func TestFetch_RobotsMissing(t *testing.T) {
 	assert.Contains(t, result.Output, "Content without robots.txt")
 }
 
+func TestFetch_RobotsCachePerHost_MultipleURLs(t *testing.T) {
+	// Regression test: robots.txt should be fetched once per host,
+	// but each URL's path must be evaluated individually.
+	robotsContent := "User-agent: *\nDisallow: /secret\nAllow: /"
+
+	robotsRequests := 0
+	url := runHTTPServer(t, func(w http.ResponseWriter, r *http.Request) {
+		switch r.URL.Path {
+		case "/robots.txt":
+			robotsRequests++
+			w.Header().Set("Content-Type", "text/plain")
+			fmt.Fprint(w, robotsContent)
+		case "/public":
+			fmt.Fprint(w, "public content")
+		case "/secret/data":
+			fmt.Fprint(w, "secret content")
+		default:
+			http.NotFound(w, r)
+		}
+	})
+
+	tool := NewFetchTool()
+	result, err := tool.handler.CallTool(t.Context(), FetchToolArgs{
+		URLs:   []string{url + "/public", url + "/secret/data"},
+		Format: "text",
+	})
+	require.NoError(t, err)
+
+	var results []FetchResult
+	err = json.Unmarshal([]byte(result.Output), &results)
+	require.NoError(t, err)
+	require.Len(t, results, 2)
+
+	// First URL should succeed
+	assert.Equal(t, 200, results[0].StatusCode)
+	assert.Equal(t, "public content", results[0].Body)
+
+	// Second URL should be blocked
+	assert.Contains(t, results[1].Error, "URL blocked by robots.txt")
+
+	// robots.txt should have been fetched exactly once
+	assert.Equal(t, 1, robotsRequests, "robots.txt should be fetched once per host")
+}
+
+func TestFetch_RobotsUnexpectedStatus(t *testing.T) {
+	url := runHTTPServer(t, func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/robots.txt" {
+			w.WriteHeader(http.StatusInternalServerError)
+			return
+		}
+		fmt.Fprint(w, "content")
+	})
+
+	tool := NewFetchTool()
+	result, err := tool.handler.CallTool(t.Context(), FetchToolArgs{
+		URLs:   []string{url + "/page"},
+		Format: "text",
+	})
+	require.NoError(t, err)
+	assert.Contains(t, result.Output, "robots.txt check failed")
+	assert.Contains(t, result.Output, "unexpected status 500")
+}
+
 func TestFetchTool_OutputSchema(t *testing.T) {
 	tool := NewFetchTool()