|
1 | 1 | import z from "zod" |
| 2 | +import { Effect } from "effect" |
| 3 | +import { HttpClient, HttpClientRequest, HttpClientResponse } from "effect/unstable/http" |
2 | 4 | import { Tool } from "./tool" |
3 | 5 | import TurndownService from "turndown" |
4 | 6 | import DESCRIPTION from "./webfetch.txt" |
5 | | -import { abortAfterAny } from "../util/abort" |
6 | | -import { iife } from "@/util/iife" |
7 | 7 |
|
/** Largest response body accepted by the tool, in bytes (5MB). */
const MAX_RESPONSE_SIZE = 5 * 1024 ** 2
/** Timeout used when the caller does not pass one, in milliseconds (30 seconds). */
const DEFAULT_TIMEOUT = 30_000
/** Upper bound applied to any requested timeout, in milliseconds (2 minutes). */
const MAX_TIMEOUT = 2 * 60 * 1000
11 | 11 |
|
/**
 * Input schema for the `webfetch` tool.
 *
 * - `url`: address to fetch; `execute` rejects anything not starting with `http://` or `https://`.
 * - `format`: how the body is returned (`text`, `markdown` or `html`); defaults to `markdown`.
 * - `timeout`: optional timeout in seconds; `execute` caps it at `MAX_TIMEOUT`.
 */
const parameters = z.object({
  url: z.string().describe("The URL to fetch content from"),
  format: z
    .enum(["text", "markdown", "html"])
    .default("markdown")
    .describe("The format to return the content in (text, markdown, or html). Defaults to markdown."),
  timeout: z.number().describe("Optional timeout in seconds (max 120)").optional(),
})

/**
 * Tool that downloads a URL and returns its content as text, markdown or raw HTML.
 *
 * `execute` performs these steps:
 * 1. Validates the URL scheme and asks for the `webfetch` permission through `ctx.ask`.
 * 2. Issues a GET with a browser-like User-Agent; if the server answers 403 with
 *    `cf-mitigated: challenge`, retries once with the User-Agent `opencode`.
 *    Both attempts share a single timeout (`params.timeout` seconds, capped at `MAX_TIMEOUT`).
 * 3. Rejects responses larger than `MAX_RESPONSE_SIZE`, checking both the declared
 *    `content-length` and the actual number of bytes received.
 * 4. Returns images (except SVG and `image/vnd.fastbidsheet`) as a base64 data-URL attachment;
 *    otherwise decodes the body and, for `text/html` responses, converts it to the requested format.
 *
 * The returned promise rejects on an invalid URL, a non-2xx status, a timeout, or an oversized body.
 * Aborting `ctx.abort` interrupts the running effect.
 */
export const WebFetchTool = Tool.defineEffect(
  "webfetch",
  Effect.gen(function* () {
    const http = yield* HttpClient.HttpClient
    // Client variant that fails with a StatusCodeError for non-2xx responses.
    const httpOk = HttpClient.filterStatusOk(http)

    return {
      description: DESCRIPTION,
      parameters,
      execute: (params: z.infer<typeof parameters>, ctx: Tool.Context) =>
        Effect.runPromise(
          Effect.gen(function* () {
            if (!params.url.startsWith("http://") && !params.url.startsWith("https://")) {
              throw new Error("URL must start with http:// or https://")
            }

            yield* Effect.promise(() =>
              ctx.ask({
                permission: "webfetch",
                patterns: [params.url],
                always: ["*"],
                metadata: {
                  url: params.url,
                  format: params.format,
                  timeout: params.timeout,
                },
              }),
            )

            // params.timeout is in seconds; the constants are in milliseconds.
            const timeout = Math.min((params.timeout ?? DEFAULT_TIMEOUT / 1000) * 1000, MAX_TIMEOUT)

            // Build Accept header based on requested format with q parameters for fallbacks
            let acceptHeader = "*/*"
            switch (params.format) {
              case "markdown":
                acceptHeader =
                  "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1"
                break
              case "text":
                acceptHeader = "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1"
                break
              case "html":
                acceptHeader =
                  "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1"
                break
              default:
                acceptHeader =
                  "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
            }
            const headers = {
              "User-Agent":
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
              Accept: acceptHeader,
              "Accept-Language": "en-US,en;q=0.9",
            }

            const request = HttpClientRequest.get(params.url).pipe(HttpClientRequest.setHeaders(headers))

            // Retry with honest UA if blocked by Cloudflare bot detection (TLS fingerprint mismatch)
            const response = yield* httpOk.execute(request).pipe(
              Effect.catchIf(
                (err) =>
                  err.reason._tag === "StatusCodeError" &&
                  err.reason.response.status === 403 &&
                  err.reason.response.headers["cf-mitigated"] === "challenge",
                () =>
                  httpOk.execute(
                    HttpClientRequest.get(params.url).pipe(
                      HttpClientRequest.setHeaders({ ...headers, "User-Agent": "opencode" }),
                    ),
                  ),
              ),
              // The timeout wraps the initial attempt and the retry together.
              Effect.timeoutOrElse({ duration: timeout, orElse: () => Effect.die(new Error("Request timed out")) }),
            )

            // Check the declared content length first so oversized bodies are rejected before download.
            const contentLength = response.headers["content-length"]
            if (contentLength && Number.parseInt(contentLength, 10) > MAX_RESPONSE_SIZE) {
              throw new Error("Response too large (exceeds 5MB limit)")
            }

            // The header may be missing or wrong, so re-check the bytes actually received.
            const arrayBuffer = yield* response.arrayBuffer
            if (arrayBuffer.byteLength > MAX_RESPONSE_SIZE) {
              throw new Error("Response too large (exceeds 5MB limit)")
            }

            const contentType = response.headers["content-type"] || ""
            const mime = contentType.split(";")[0]?.trim().toLowerCase() || ""
            const title = `${params.url} (${contentType})`

            // Check if response is an image
            const isImage = mime.startsWith("image/") && mime !== "image/svg+xml" && mime !== "image/vnd.fastbidsheet"

            if (isImage) {
              const base64Content = Buffer.from(arrayBuffer).toString("base64")
              return {
                title,
                output: "Image fetched successfully",
                metadata: {},
                attachments: [
                  {
                    type: "file" as const,
                    mime,
                    url: `data:${mime};base64,${base64Content}`,
                  },
                ],
              }
            }

            const content = new TextDecoder().decode(arrayBuffer)

            // Handle content based on requested format and actual content type
            switch (params.format) {
              case "markdown":
                if (contentType.includes("text/html")) {
                  const markdown = convertHTMLToMarkdown(content)
                  return {
                    output: markdown,
                    title,
                    metadata: {},
                  }
                }
                return { output: content, title, metadata: {} }

              case "text":
                if (contentType.includes("text/html")) {
                  const text = yield* Effect.promise(() => extractTextFromHTML(content))
                  return { output: text, title, metadata: {} }
                }
                return { output: content, title, metadata: {} }

              case "html":
                return { output: content, title, metadata: {} }

              default:
                return { output: content, title, metadata: {} }
            }
          }),
          // Forward the caller's abort signal so cancelling the tool call interrupts
          // the in-flight request and body download.
          { signal: ctx.abort },
        ),
    }
  }),
)
167 | 161 |
|
168 | 162 | async function extractTextFromHTML(html: string) { |
169 | 163 | let text = "" |
|
0 commit comments