
Commit 9c789d9

fix: cache parsed robots.txt per host instead of boolean result
The robots.txt cache was keyed by host but stored a boolean from the first URL's path check. This caused subsequent URLs on the same host to skip their own per-path robots.txt evaluation. Cache the parsed *robotstxt.RobotsData per host and call TestAgent for each URL individually.

Assisted-By: docker-agent
1 parent e5f7cfa commit 9c789d9

1 file changed

Lines changed: 23 additions & 18 deletions


pkg/tools/builtin/fetch.go

@@ -59,8 +59,8 @@ func (h *fetchHandler) CallTool(ctx context.Context, params FetchToolArgs) (*too
 
     var results []FetchResult
 
-    // Group URLs by host to fetch robots.txt once per host
-    robotsCache := make(map[string]bool)
+    // Cache parsed robots.txt per host
+    robotsCache := make(map[string]*robotstxt.RobotsData)
 
     for _, urlStr := range params.URLs {
         result := h.fetchURL(ctx, client, urlStr, params.Format, robotsCache)
@@ -91,7 +91,7 @@ type FetchResult struct {
     Error string `json:"error,omitempty"`
 }
 
-func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr, format string, robotsCache map[string]bool) FetchResult {
+func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr, format string, robotsCache map[string]*robotstxt.RobotsData) FetchResult {
     result := FetchResult{URL: urlStr}
 
     // Validate URL
@@ -115,13 +115,18 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
 
     // Check robots.txt (with caching per host)
     host := parsedURL.Host
-    allowed, cached := robotsCache[host]
+    robots, cached := robotsCache[host]
     if !cached {
-        allowed = h.checkRobotsAllowed(ctx, client, parsedURL, useragent.Header)
-        robotsCache[host] = allowed
+        var err error
+        robots, err = h.fetchRobots(ctx, client, parsedURL, useragent.Header)
+        if err != nil {
+            result.Error = fmt.Sprintf("robots.txt check failed: %v", err)
+            return result
+        }
+        robotsCache[host] = robots
     }
 
-    if !allowed {
+    if robots != nil && !robots.TestAgent(parsedURL.Path, useragent.Header) {
         result.Error = "URL blocked by robots.txt"
         return result
     }
@@ -191,7 +196,10 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
     return result
 }
 
-func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Client, targetURL *url.URL, userAgent string) bool {
+// fetchRobots fetches and parses robots.txt for the given URL's host.
+// Returns nil (allow all) if robots.txt is missing or unreachable.
+// Returns an error if the server returns a non-OK status or the content cannot be read/parsed.
+func (h *fetchHandler) fetchRobots(ctx context.Context, client *http.Client, targetURL *url.URL, userAgent string) (*robotstxt.RobotsData, error) {
     // Build robots.txt URL
     robotsURL := &url.URL{
         Scheme: targetURL.Scheme,
@@ -203,7 +211,7 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
     req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL.String(), http.NoBody)
     if err != nil {
         // If we can't create request, allow the fetch
-        return true
+        return nil, nil
     }
 
     req.Header.Set("User-Agent", userAgent)
@@ -217,36 +225,33 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
     resp, err := robotsClient.Do(req)
     if err != nil {
         // If robots.txt is unreachable, allow the fetch
-        return true
+        return nil, nil
     }
     defer resp.Body.Close()
 
     // If robots.txt doesn't exist (404), allow the fetch
     if resp.StatusCode == http.StatusNotFound {
-        return true
+        return nil, nil
     }
 
     // For other non-200 status codes, fail the fetch
     if resp.StatusCode != http.StatusOK {
-        return false
+        return nil, fmt.Errorf("unexpected status %d", resp.StatusCode)
     }
 
     // Read robots.txt content (limit to 64KB)
     robotsBody, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024))
     if err != nil {
-        // If we can't read robots.txt, fail the fetch
-        return false
+        return nil, fmt.Errorf("failed to read robots.txt: %w", err)
     }
 
     // Parse robots.txt
     robots, err := robotstxt.FromBytes(robotsBody)
     if err != nil {
-        // If we can't parse robots.txt, fail the fetch
-        return false
+        return nil, fmt.Errorf("failed to parse robots.txt: %w", err)
     }
 
-    // Check if the target URL path is allowed for our user agent
-    return robots.TestAgent(targetURL.Path, userAgent)
+    return robots, nil
 }
 
 func htmlToMarkdown(html string) string {
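
For readers who want the pattern outside of the diff, below is a minimal, self-contained sketch of the caching approach this commit moves to: fetch and parse robots.txt once per host, cache the parsed *robotstxt.RobotsData, and call TestAgent with each URL's own path. The import path github.com/temoto/robotstxt, the helper names, and the simplified error handling are assumptions for illustration; only the robotstxt.FromBytes and TestAgent usage mirror the diff above.

package main

import (
    "fmt"
    "io"
    "net/http"
    "net/url"

    "github.com/temoto/robotstxt" // assumed import path; the diff only shows the package name "robotstxt"
)

// checkURLs mirrors the commit's approach: fetch and parse robots.txt once per
// host, cache the parsed *robotstxt.RobotsData, and test each URL's own path.
// A nil cache entry means "no robots.txt / unreachable", i.e. allow everything.
func checkURLs(client *http.Client, agent string, urls []string) map[string]bool {
    cache := make(map[string]*robotstxt.RobotsData)
    allowed := make(map[string]bool)

    for _, raw := range urls {
        u, err := url.Parse(raw)
        if err != nil {
            allowed[raw] = false
            continue
        }

        robots, seen := cache[u.Host]
        if !seen {
            robots = fetchRobots(client, u) // may be nil (allow all)
            cache[u.Host] = robots
        }

        // Per-URL path check; this is the step the old boolean cache skipped
        // for every URL after the first one on a host.
        allowed[raw] = robots == nil || robots.TestAgent(u.Path, agent)
    }
    return allowed
}

// fetchRobots downloads and parses /robots.txt for the target's host.
// Any failure is treated as "allow all" to keep the sketch short; the
// real handler distinguishes 404 from other errors.
func fetchRobots(client *http.Client, target *url.URL) *robotstxt.RobotsData {
    robotsURL := url.URL{Scheme: target.Scheme, Host: target.Host, Path: "/robots.txt"}

    resp, err := client.Get(robotsURL.String())
    if err != nil {
        return nil
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil
    }

    body, err := io.ReadAll(io.LimitReader(resp.Body, 64*1024)) // same 64KB cap as the handler
    if err != nil {
        return nil
    }

    robots, err := robotstxt.FromBytes(body)
    if err != nil {
        return nil
    }
    return robots
}

func main() {
    urls := []string{"https://example.com/", "https://example.com/private/page"}
    for u, ok := range checkURLs(http.DefaultClient, "example-agent", urls) {
        fmt.Printf("%-40s allowed=%v\n", u, ok)
    }
}

With the old boolean cache, the second URL in this example would simply inherit whatever verdict the first URL's path produced; caching the parsed data instead lets every path be evaluated on its own.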
