Crawl
Crawl a site and aggregate per-page scraping outputs
Create Crawl task
Start a site crawl job. The job runs asynchronously; the response returns a job_id that you use to poll for status and results.
Authorization
AuthorizationRequiredBearer <token>JWT token for API authentication
In: header
Request Body
application/jsonRequiredurlRequiredstringSeed URL to start crawling
"uri"engineRequiredstringThe scraping engine used for each crawled page
"playwright" | "cheerio" | "puppeteer"proxystring"uri"formatsarray<string>["markdown"]timeoutnumber300000Minimum: 1000Maximum: 600000wait_fornumber1Maximum: 60000retrybooleanfalseinclude_tagsarray<string>exclude_tagsarray<string>json_optionsobjectextract_sourcestring"markdown"Value in: "html" | "markdown"scrape_optionsobjectPer-page scraping options applied during crawling
exclude_pathsarray<string>Glob patterns or path prefixes to exclude from crawling
include_pathsarray<string>Glob patterns or path prefixes to include for crawling (applied after exclusion rules)
scrape_pathsarray<string>Glob patterns or path prefixes for content extraction. Only URLs matching these patterns will have content extracted and saved. If not specified, all included URLs will be scraped (default behavior)
max_depthnumberMaximum crawl depth from the seed URL
10Minimum: 1Maximum: 50strategystringCrawl scope strategy
"same-domain"Value in: "all" | "same-domain" | "same-hostname" | "same-origin"limitnumberMaximum number of pages to crawl
100Minimum: 1Maximum: 50000Response Body
Crawl job creation response (HTTP 200)
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the crawl job was accepted and queued
truedataRequiredobject
Standard error response format for validation errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed
falseerrorRequiredstringError message
detailsRequiredobjectValidation error details
Unauthorized response format for authentication errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to authentication issues
falseerrorRequiredstringAuthentication error message
Payment required response format with credit information
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to insufficient credits
falseerrorRequiredstringError message
current_creditsRequirednumberCurrent credit balance of the user
Internal server error response format
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to server error
falseerrorRequiredstringServer error message
messageRequiredstringDetailed error message describing what went wrong
curl -X POST "https://api.anycrawl.dev/v1/crawl" \
-H "Authorization: Bearer <token>" \
-H "Content-Type: application/json" \
-d '{
"url": "https://anycrawl.dev",
"engine": "cheerio"
}'const body = JSON.stringify({
"url": "https://anycrawl.dev",
"engine": "cheerio"
})
fetch("https://api.anycrawl.dev/v1/crawl", {
method: "POST",
headers: {
"Authorization": "Bearer <token>",
"Content-Type": "application/json"
},
body
})package main
import (
"fmt"
"net/http"
"io/ioutil"
"strings"
)
func main() {
url := "https://api.anycrawl.dev/v1/crawl"
body := strings.NewReader(`{
"url": "https://anycrawl.dev",
"engine": "cheerio"
}`)
req, _ := http.NewRequest("POST", url, body)
req.Header.Add("Authorization", "Bearer <token>")
req.Header.Add("Content-Type", "application/json")
res, _ := http.DefaultClient.Do(req)
defer res.Body.Close()
respBody, _ := ioutil.ReadAll(res.Body)
fmt.Println(res)
fmt.Println(string(respBody))
}import requests
url = "https://api.anycrawl.dev/v1/crawl"
body = {
"url": "https://anycrawl.dev",
"engine": "cheerio"
}
response = requests.request("POST", url, json = body, headers = {
"Authorization": "Bearer <token>",
"Content-Type": "application/json"
})
print(response.text){
"success": true,
"data": {
"job_id": "7a2e165d-8f81-4be6-9ef7-23222330a396",
"status": "created",
"message": "Crawl job has been queued for processing"
}
}{
"success": false,
"error": "Validation error",
"details": {
"issues": [
{
"field": "engine",
"message": "Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'",
"code": "invalid_enum_value"
}
],
"messages": [
"Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'"
]
}
}{
"success": false,
"error": "Invalid API key"
}{
"success": false,
"error": "Insufficient credits",
"current_credits": -2
}{
"success": false,
"error": "Internal server error",
"message": "Job 0ae56ed9-d9a9-4998-aea9-2ff5b51b2e4e timed out after 30000 seconds"
}Check Crawl status
Get the current status of a crawl job
Authorization
AuthorizationRequiredBearer <token>JWT token for API authentication
In: header
Path Parameters
jobIdRequiredstringThe crawl job ID
"uuid"Response Body
Crawl job status response (HTTP 200)
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleantruemessageRequiredstringStatus message
dataRequiredobject
Standard error response format for validation errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed
falseerrorRequiredstringError message
detailsRequiredobjectValidation error details
Unauthorized response format for authentication errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to authentication issues
falseerrorRequiredstringAuthentication error message
Internal server error response format
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to server error
falseerrorRequiredstringServer error message
messageRequiredstringDetailed error message describing what went wrong
curl -X GET "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08/status" \
-H "Authorization: Bearer <token>"fetch("https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08/status", {
headers: {
"Authorization": "Bearer <token>"
}
})package main
import (
"fmt"
"net/http"
"io/ioutil"
)
func main() {
url := "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08/status"
req, _ := http.NewRequest("GET", url, nil)
req.Header.Add("Authorization", "Bearer <token>")
res, _ := http.DefaultClient.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(res)
fmt.Println(string(body))
}import requests
url = "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08/status"
response = requests.request("GET", url, headers = {
"Authorization": "Bearer <token>"
})
print(response.text){
"success": true,
"message": "Job status retrieved successfully",
"data": {
"job_id": "453bd7d7-5355-4d6d-a38e-d9e7eb218c3f",
"status": "pending",
"start_time": "2025-05-25T07:56:44.162Z",
"expires_at": "2025-05-26T07:56:44.162Z",
"credits_used": 0,
"total": 120,
"completed": 30,
"failed": 2
}
}{
"success": false,
"error": "Validation error",
"details": {
"issues": [
{
"field": "engine",
"message": "Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'",
"code": "invalid_enum_value"
}
],
"messages": [
"Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'"
]
}
}{
"success": false,
"error": "Invalid API key"
}{
"success": false,
"error": "Internal server error",
"message": "Job 0ae56ed9-d9a9-4998-aea9-2ff5b51b2e4e timed out after 30000 seconds"
}Get Crawl results
Get crawl results (paginated via skip query param).
Authorization
AuthorizationRequiredBearer <token>JWT token for API authentication
In: header
Path Parameters
jobIdRequiredstringThe crawl job ID
"uuid"Query Parameters
skipintegerNumber of results to skip (page offset)
0Response Body
Crawl job results (paginated) response (HTTP 200)
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleantruestatusRequiredstring"pending" | "completed" | "failed" | "cancelled"totalRequirednumbercompletedRequirednumbercreditsUsedRequirednumbernextstring | nullNext page URL if more results are available
"uri"dataRequiredarray<unknown>Array of per-page scraping results produced by the crawl
Standard error response format for validation errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed
falseerrorRequiredstringError message
detailsRequiredobjectValidation error details
Unauthorized response format for authentication errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to authentication issues
falseerrorRequiredstringAuthentication error message
Internal server error response format
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to server error
falseerrorRequiredstringServer error message
messageRequiredstringDetailed error message describing what went wrong
curl -X GET "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08?skip=0" \
-H "Authorization: Bearer <token>"fetch("https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08?skip=0", {
headers: {
"Authorization": "Bearer <token>"
}
})package main
import (
"fmt"
"net/http"
"io/ioutil"
)
func main() {
url := "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08?skip=0"
req, _ := http.NewRequest("GET", url, nil)
req.Header.Add("Authorization", "Bearer <token>")
res, _ := http.DefaultClient.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(res)
fmt.Println(string(body))
}import requests
url = "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08?skip=0"
response = requests.request("GET", url, headers = {
"Authorization": "Bearer <token>"
})
print(response.text){
"success": true,
"status": "pending",
"total": 120,
"completed": 30,
"creditsUsed": 12,
"next": "https://api.anycrawl.dev/v1/crawl/7a2e165d-8f81-4be6-9ef7-23222330a396?skip=100",
"data": [
null
]
}{
"success": false,
"error": "Validation error",
"details": {
"issues": [
{
"field": "engine",
"message": "Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'",
"code": "invalid_enum_value"
}
],
"messages": [
"Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'"
]
}
}{
"success": false,
"error": "Invalid API key"
}{
"success": false,
"error": "Internal server error",
"message": "Job 0ae56ed9-d9a9-4998-aea9-2ff5b51b2e4e timed out after 30000 seconds"
}Cancel crawl
Cancel a pending crawl job
Authorization
AuthorizationRequiredBearer <token>JWT token for API authentication
In: header
Path Parameters
jobIdRequiredstringThe crawl job ID
"uuid"Response Body
Crawl job cancellation response (HTTP 200)
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleantruemessageRequiredstringdataRequiredobject
Standard error response format for validation errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed
falseerrorRequiredstringError message
detailsRequiredobjectValidation error details
Unauthorized response format for authentication errors
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to authentication issues
falseerrorRequiredstringAuthentication error message
Internal server error response format
TypeScript Definitions
Use the response body type in TypeScript.
successRequiredbooleanIndicates the request failed due to server error
falseerrorRequiredstringServer error message
messageRequiredstringDetailed error message describing what went wrong
curl -X DELETE "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08" \
-H "Authorization: Bearer <token>"fetch("https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08", {
method: "DELETE",
headers: {
"Authorization": "Bearer <token>"
}
})package main
import (
"fmt"
"net/http"
"io/ioutil"
)
func main() {
url := "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08"
req, _ := http.NewRequest("DELETE", url, nil)
req.Header.Add("Authorization", "Bearer <token>")
res, _ := http.DefaultClient.Do(req)
defer res.Body.Close()
body, _ := ioutil.ReadAll(res.Body)
fmt.Println(res)
fmt.Println(string(body))
}import requests
url = "https://api.anycrawl.dev/v1/crawl/497f6eca-6276-4993-bfeb-53cbbbba6f08"
response = requests.request("DELETE", url, headers = {
"Authorization": "Bearer <token>"
})
print(response.text){
"success": true,
"message": "Job cancelled successfully",
"data": {
"job_id": "453bd7d7-5355-4d6d-a38e-d9e7eb218c3f",
"status": "cancelled"
}
}{
"success": false,
"error": "Validation error",
"details": {
"issues": [
{
"field": "engine",
"message": "Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'",
"code": "invalid_enum_value"
}
],
"messages": [
"Invalid enum value. Expected 'playwright' | 'cheerio' | 'puppeteer', received 'cheeri1o'"
]
}
}{
"success": false,
"error": "Invalid API key"
}{
"success": false,
"error": "Internal server error",
"message": "Job 0ae56ed9-d9a9-4998-aea9-2ff5b51b2e4e timed out after 30000 seconds"
}