@@ -59,8 +59,8 @@ func (h *fetchHandler) CallTool(ctx context.Context, params FetchToolArgs) (*too
5959
6060 var results []FetchResult
6161
62- // Group URLs by host to fetch robots.txt once per host
63- robotsCache := make (map [string ]bool )
62+ // Cache parsed robots.txt per host
63+ robotsCache := make (map [string ]* robotstxt. RobotsData )
6464
6565 for _ , urlStr := range params .URLs {
6666 result := h .fetchURL (ctx , client , urlStr , params .Format , robotsCache )
@@ -91,7 +91,7 @@ type FetchResult struct {
9191 Error string `json:"error,omitempty"`
9292}
9393
94- func (h * fetchHandler ) fetchURL (ctx context.Context , client * http.Client , urlStr , format string , robotsCache map [string ]bool ) FetchResult {
94+ func (h * fetchHandler ) fetchURL (ctx context.Context , client * http.Client , urlStr , format string , robotsCache map [string ]* robotstxt. RobotsData ) FetchResult {
9595 result := FetchResult {URL : urlStr }
9696
9797 // Validate URL
@@ -115,13 +115,18 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
115115
116116 // Check robots.txt (with caching per host)
117117 host := parsedURL .Host
118- allowed , cached := robotsCache [host ]
118+ robots , cached := robotsCache [host ]
119119 if ! cached {
120- allowed = h .checkRobotsAllowed (ctx , client , parsedURL , useragent .Header )
121- robotsCache [host ] = allowed
120+ var err error
121+ robots , err = h .fetchRobots (ctx , client , parsedURL , useragent .Header )
122+ if err != nil {
123+ result .Error = fmt .Sprintf ("robots.txt check failed: %v" , err )
124+ return result
125+ }
126+ robotsCache [host ] = robots
122127 }
123128
124- if ! allowed {
129+ if robots != nil && ! robots . TestAgent ( parsedURL . Path , useragent . Header ) {
125130 result .Error = "URL blocked by robots.txt"
126131 return result
127132 }
@@ -191,7 +196,10 @@ func (h *fetchHandler) fetchURL(ctx context.Context, client *http.Client, urlStr
191196 return result
192197}
193198
194- func (h * fetchHandler ) checkRobotsAllowed (ctx context.Context , client * http.Client , targetURL * url.URL , userAgent string ) bool {
199+ // fetchRobots fetches and parses robots.txt for the given URL's host.
200+ // Returns nil, nil (allow all) if the request cannot be built, robots.txt is unreachable, or the server answers 404.
201+ // Returns an error for any other non-200 status, or if the content cannot be read/parsed.
202+ func (h * fetchHandler ) fetchRobots (ctx context.Context , client * http.Client , targetURL * url.URL , userAgent string ) (* robotstxt.RobotsData , error ) {
195203 // Build robots.txt URL
196204 robotsURL := & url.URL {
197205 Scheme : targetURL .Scheme ,
@@ -203,7 +211,7 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
203211 req , err := http .NewRequestWithContext (ctx , http .MethodGet , robotsURL .String (), http .NoBody )
204212 if err != nil {
205213 // If we can't create request, allow the fetch
206- return true
214+ return nil , nil
207215 }
208216
209217 req .Header .Set ("User-Agent" , userAgent )
@@ -217,36 +225,33 @@ func (h *fetchHandler) checkRobotsAllowed(ctx context.Context, client *http.Clie
217225 resp , err := robotsClient .Do (req )
218226 if err != nil {
219227 // If robots.txt is unreachable, allow the fetch
220- return true
228+ return nil , nil
221229 }
222230 defer resp .Body .Close ()
223231
224232 // If robots.txt doesn't exist (404), allow the fetch
225233 if resp .StatusCode == http .StatusNotFound {
226- return true
234+ return nil , nil
227235 }
228236
229237 // For other non-200 status codes, fail the fetch
230238 if resp .StatusCode != http .StatusOK {
231- return false
239+ return nil , fmt . Errorf ( "unexpected status %d" , resp . StatusCode )
232240 }
233241
234242 // Read robots.txt content (limit to 64KB)
235243 robotsBody , err := io .ReadAll (io .LimitReader (resp .Body , 64 * 1024 ))
236244 if err != nil {
237- // If we can't read robots.txt, fail the fetch
238- return false
245+ return nil , fmt .Errorf ("failed to read robots.txt: %w" , err )
239246 }
240247
241248 // Parse robots.txt
242249 robots , err := robotstxt .FromBytes (robotsBody )
243250 if err != nil {
244- // If we can't parse robots.txt, fail the fetch
245- return false
251+ return nil , fmt .Errorf ("failed to parse robots.txt: %w" , err )
246252 }
247253
248- // Check if the target URL path is allowed for our user agent
249- return robots .TestAgent (targetURL .Path , userAgent )
254+ return robots , nil
250255}
251256
252257func htmlToMarkdown (html string ) string {
0 commit comments