From 9c6e01686b8a1bc74de8c80f731f0e7e45dd0e5e Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 11:35:39 -0400 Subject: [PATCH 01/21] fix: add scraping strategy for pdf --- crawl4ai/__init__.py | 5 ++ deploy/docker/api.py | 40 ++++++++++++-- deploy/docker/crawler_pool.py | 28 +++++++--- deploy/docker/utils.py | 31 ++++++++++- docs/examples/docker/demo_docker_api.py | 69 +++++++++++++++++++++++++ 5 files changed, 160 insertions(+), 13 deletions(-) diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 6917f27e9..a6a1419af 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -10,6 +10,9 @@ LXMLWebScrapingStrategy, WebScrapingStrategy, # Backward compatibility alias ) + +from .processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy + from .async_logger import ( AsyncLoggerBase, AsyncLogger, @@ -128,6 +131,8 @@ "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", + "PDFCrawlerStrategy", + "PDFContentScrapingStrategy", "FilterChain", "URLPatternFilter", "ContentTypeFilter", diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 58d8c01fe..6b2b31468 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -13,8 +13,12 @@ from fastapi import HTTPException, Request, status from fastapi.background import BackgroundTasks from fastapi.responses import JSONResponse +from fastapi.encoders import jsonable_encoder + from redis import asyncio as aioredis +from utils import is_pdf_url + from crawl4ai import ( AsyncWebCrawler, CrawlerRunConfig, @@ -31,6 +35,10 @@ BM25ContentFilter, LLMContentFilter ) + +from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy +from crawl4ai.async_configs import to_serializable_dict + from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy @@ -431,6 +439,18 @@ async def handle_crawl_request( urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) + + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) + is_pdf = any(is_pdf_flags) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None + + if is_pdf and crawler_config.scraping_strategy is None: + # Default strategy if not set + crawler_config.scraping_strategy = PDFContentScrapingStrategy( + extract_images=False, + save_images_locally=False, + batch_size=2 + ) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], @@ -440,7 +460,7 @@ async def handle_crawl_request( ) from crawler_pool import get_crawler - crawler = await get_crawler(browser_config) + crawler = await get_crawler(browser_config, crawler_strategy) # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) # await crawler.start() @@ -476,7 +496,8 @@ async def handle_crawl_request( # If PDF exists, encode it to base64 if result_dict.get('pdf') is not None: result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8') - processed_results.append(result_dict) + + processed_results.append(to_serializable_dict(result_dict)) return { "success": True, @@ -521,8 +542,19 @@ async def handle_stream_crawl_request( # browser_config.verbose = True # Set to False or remove for production stress testing browser_config.verbose = False crawler_config = CrawlerRunConfig.load(crawler_config) - 
crawler_config.scraping_strategy = LXMLWebScrapingStrategy() crawler_config.stream = True + + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) + is_pdf = any(is_pdf_flags) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None + + if is_pdf and crawler_config.scraping_strategy is None: + # Default strategy if not set + crawler_config.scraping_strategy = PDFContentScrapingStrategy( + extract_images=True, + save_images_locally=False, + batch_size=2 + ) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], @@ -532,7 +564,7 @@ async def handle_stream_crawl_request( ) from crawler_pool import get_crawler - crawler = await get_crawler(browser_config) + crawler = await get_crawler(browser_config, crawler_strategy) # crawler = AsyncWebCrawler(config=browser_config) # await crawler.start() diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index d15102e4d..82ba30b71 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -1,11 +1,11 @@ # crawler_pool.py (new file) import asyncio, json, hashlib, time, psutil from contextlib import suppress -from typing import Dict +from typing import Dict, Optional from crawl4ai import AsyncWebCrawler, BrowserConfig -from typing import Dict from utils import load_config + CONFIG = load_config() POOL: Dict[str, AsyncWebCrawler] = {} @@ -15,20 +15,31 @@ MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min -def _sig(cfg: BrowserConfig) -> str: - payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":")) - return hashlib.sha1(payload.encode()).hexdigest() +def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: + """ + Generate a unique signature for a crawler based on browser config + and optional crawler strategy. This ensures that crawlers with + different strategies (e.g., PDF) are stored separately in the pool. 
+ """ + payload = cfg.to_dict() + + if crawler_strategy is not None: + payload["strategy"] = crawler_strategy.__class__.__name__ + + json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return hashlib.sha1(json_payload.encode()).hexdigest() + -async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: +async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: try: - sig = _sig(cfg) + sig = _sig(cfg, crawler_strategy=crawler_strategy) async with LOCK: if sig in POOL: LAST_USED[sig] = time.time(); return POOL[sig] if psutil.virtual_memory().percent >= MEM_LIMIT: raise MemoryError("RAM pressure – new browser denied") - crawler = AsyncWebCrawler(config=cfg, thread_safe=False) + crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) await crawler.start() POOL[sig] = crawler; LAST_USED[sig] = time.time() return crawler @@ -44,6 +55,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: POOL.pop(sig, None) LAST_USED.pop(sig, None) # If we failed to start the browser, we should remove it from the pool + async def close_all(): async with LOCK: await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True) diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index 2e2a80ac7..7da18fda9 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -2,6 +2,7 @@ import logging import yaml import os +import httpx from datetime import datetime from enum import Enum from pathlib import Path @@ -124,4 +125,32 @@ def verify_email_domain(email: str) -> bool: records = dns.resolver.resolve(domain, 'MX') return True if records else False except Exception as e: - return False \ No newline at end of file + return False + +async def is_pdf_url(url: str) -> bool: + """ + Check if a URL points to a PDF using httpx: + - Check extension + - Check Content-Type via HEAD request + - Check first 5 bytes (magic number) if needed + """ + if url.lower().endswith(".pdf"): + return True + + try: + async with httpx.AsyncClient(follow_redirects=True) as client: + # HEAD request to check Content-Type + head_resp = await client.head(url) + content_type = head_resp.headers.get("content-type", "").lower() + if "application/pdf" in content_type: + return True + + # Fallback: GET first 5 bytes to check PDF magic number + get_resp = await client.get(url, headers={"Range": "bytes=0-4"}) + if get_resp.status_code in (200, 206): # 206 Partial Content + return get_resp.content.startswith(b"%PDF-") + except Exception: + return False + + return False + diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 0a3d51af1..f13e341e5 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -27,6 +27,7 @@ FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl +PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") @@ -1261,6 +1262,71 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient): console.print( f"[bold red]Unexpected error during invalid test:[/] {e}") +# 10. 
Crawl PDF + +async def demo_pdf_crawl(client: httpx.AsyncClient): + payload = { + "urls": [PDF_URL], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PDFContentScrapingStrategy", + "params": { + "extract_images": False, + "save_images_locally": False, + "batch_size": 2 + } + } + } + } + } + + resp = await client.post("/crawl", json=payload) + resp.raise_for_status() + data = resp.json() + print("=== Demo: PDF Crawl ===") + print("Success:", data.get("success")) + print("Number of results:", len(data.get("results", []))) + if data.get("results"): + first = data["results"][0] + text_snippet = (first.get("text") or "")[:500] + print("Extracted text (first 500 chars):") + print(text_snippet) + +# 11. Crawl PDF stream + +async def demo_pdf_crawl_stream(client: httpx.AsyncClient): + """ + Demo: Crawl a PDF and stream the extracted text content. + """ + payload = { + "urls": [PDF_URL], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, + "cache_mode": "BYPASS", + "scraping_strategy": { # <-- Default strategy if not set + "type": "PDFContentScrapingStrategy", + "params": { + "extract_images": False, + "save_images_locally": False, + "batch_size": 2 + } + } + } + } + } + + await stream_request( + client, + "/crawl/stream", + payload, + "Demo PDF: Streaming PDF Crawl" + ) + # --- Update Main Runner to include new demo --- async def main_demo(): @@ -1294,6 +1360,9 @@ async def main_demo(): # await demo_deep_with_llm_extraction(client) # await demo_deep_with_proxy(client) # Skips if no PROXIES env var # await demo_deep_with_ssl(client) # Added the new demo + + # await demo_pdf_crawl_stream(client) + # await demo_pdf_crawl(client) # --- Helper endpoints --- await demo_markdown_endpoint(client) From 057fb61cf0d27075bb298162a776acdcb07f4fa9 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 11:37:47 -0400 Subject: [PATCH 02/21] fix: typo --- deploy/docker/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 6b2b31468..9e88e28f7 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -445,7 +445,7 @@ async def handle_crawl_request( crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: - # Default strategy if not set + # Default strategy if not set. 
crawler_config.scraping_strategy = PDFContentScrapingStrategy( extract_images=False, save_images_locally=False, From 5adc9dce066c481884b2c783881ce47ff72bb369 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:18:47 -0400 Subject: [PATCH 03/21] fix: add test suits --- tests/docker/test_rest_api_pdf_crawl.py | 229 ++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 tests/docker/test_rest_api_pdf_crawl.py diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py new file mode 100644 index 000000000..20b4bd008 --- /dev/null +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -0,0 +1,229 @@ +# ==== File: test_rest_api_deep_crawl.py ==== + +import pytest +import pytest_asyncio +import httpx +import json +import asyncio +import os +from typing import List, Dict, Any, AsyncGenerator + +from dotenv import load_dotenv +load_dotenv() # Load environment variables from .env file if present + +# --- Test Configuration --- +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # If server is running in Docker, use the host's IP +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode +PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825" +PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" + +# --- Helper Functions --- +def load_proxies_from_env() -> List[Dict]: + """Load proxies from PROXIES environment variable""" + proxies = [] + proxies_str = os.getenv("PROXIES", "") + if not proxies_str: + print("PROXIES environment variable not set or empty.") + return proxies + try: + proxy_list = proxies_str.split(",") + for proxy in proxy_list: + proxy = proxy.strip() + if not proxy: + continue + parts = proxy.split(":") + if len(parts) == 4: + ip, port, username, password = parts + proxies.append({ + "server": f"http://{ip}:{port}", # Assuming http, adjust if needed + "username": username, + "password": password, + "ip": ip # Store original IP if available + }) + elif len(parts) == 2: # ip:port only + ip, port = parts + proxies.append({ + "server": f"http://{ip}:{port}", + "ip": ip + }) + else: + print(f"Skipping invalid proxy string format: {proxy}") + + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + + +async def check_server_health(client: httpx.AsyncClient): + """Check if the server is healthy before running tests.""" + try: + response = await client.get("/health") + response.raise_for_status() + print(f"\nServer healthy: {response.json()}") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + pytest.fail(f"Server health check failed: {e}. 
Is the server running at {BASE_URL}?", pytrace=False) + +async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False): + """Asserts the basic structure of a single crawl result.""" + assert isinstance(result, dict) + assert "url" in result + assert "success" in result + assert "html" in result # Basic crawls should return HTML + assert "metadata" in result + assert isinstance(result["metadata"], dict) + assert "depth" in result["metadata"] # Deep crawls add depth + + if check_ssl: + assert "ssl_certificate" in result # Check if SSL info is present + assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None + + +async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: + """Processes an NDJSON streaming response.""" + results = [] + completed = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed = True + break # Stop processing after completion marker + elif data.get("url"): # Ensure it looks like a result object + results.append(data) + else: + print(f"Received non-result JSON line: {data}") # Log other status messages if needed + except json.JSONDecodeError: + pytest.fail(f"Failed to decode JSON line: {line}") + assert completed, "Streaming response did not end with a completion marker." + return results + + +# --- Pytest Fixtures --- +@pytest_asyncio.fixture(scope="function") +async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: + """Provides an async HTTP client""" + # Increased timeout for potentially longer deep crawls + async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: + yield client + # No explicit close needed with 'async with' + +# --- Test Class for PDF Scraping --- +@pytest.mark.asyncio +class TestPdfScraping: + + @pytest_asyncio.fixture(autouse=True) + async def check_health_before_tests(self, async_client: httpx.AsyncClient): + """Fixture to ensure server is healthy before each test in the class.""" + await check_server_health(async_client) + + async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): + """Test basic PDF scraping for a single PDF URL.""" + payload = { + "urls": [PDF_TEST_URL], # URL of a test PDF + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PdfScrapingStrategy", # Custom PDF scraping strategy + "params": {} + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": {"max_depth": 0, "max_pages": 1} + } + } + } + } + + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() + + assert data["success"] is True + assert len(data["results"]) == 1 + + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + # Vérifier que le texte extrait est non vide + extracted_text = result["extracted_content"].get("text", "") + assert isinstance(extracted_text, str) + assert len(extracted_text) > 0 + + async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient): + """Test PDF scraping with metadata extraction.""" + payload = { + "urls": [PDF_TEST_URL], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PdfScrapingStrategy", 
+ "params": {"extract_metadata": True} # Param spécifique pour métadonnées + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": {"max_depth": 0, "max_pages": 1} + } + } + } + } + + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() + + assert data["success"] is True + result = data["results"][0] + assert "extracted_content" in result + metadata = result["extracted_content"].get("metadata", {}) + assert isinstance(metadata, dict) + # Vérification simple : titre et auteur peuvent exister + assert "title" in metadata or "author" in metadata + + async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): + """Test PDF scraping when PDF is not accessible.""" + payload = { + "urls": [PDF_TEST_INVALID_URL], # URL invalide + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PdfScrapingStrategy", + "params": {} + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": {"max_depth": 0, "max_pages": 1} + } + } + } + } + + response = await async_client.post("/crawl", json=payload) + # Le serveur doit répondre OK mais le résultat doit indiquer échec + data = response.json() + assert data["success"] is True + result = data["results"][0] + assert result["success"] is False + assert "extracted_content" not in result or result["extracted_content"] is None + + +# --- Main Execution Block (for running script directly) --- +if __name__ == "__main__": + pytest_args = ["-v", "-s", __file__] + # Example: Run only proxy test + # pytest_args.append("-k test_deep_crawl_with_proxies") + print(f"Running pytest with args: {pytest_args}") + exit_code = pytest.main(pytest_args) + print(f"Pytest finished with exit code: {exit_code}") \ No newline at end of file From d231d618b1399143e6a4005018e8cb797f5b56a5 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:23:43 -0400 Subject: [PATCH 04/21] fix: typo --- tests/docker/test_rest_api_pdf_crawl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 20b4bd008..8549ac622 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -1,4 +1,4 @@ -# ==== File: test_rest_api_deep_crawl.py ==== +# ==== File: test_rest_api_pdf_crawl.py ==== import pytest import pytest_asyncio @@ -151,7 +151,7 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): assert result["success"] is True assert "extracted_content" in result assert result["extracted_content"] is not None - # Vérifier que le texte extrait est non vide + extracted_text = result["extracted_content"].get("text", "") assert isinstance(extracted_text, str) assert len(extracted_text) > 0 @@ -167,7 +167,7 @@ async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient) "cache_mode": "BYPASS", "scraping_strategy": { "type": "PdfScrapingStrategy", - "params": {"extract_metadata": True} # Param spécifique pour métadonnées + "params": {"extract_metadata": True} }, "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", @@ -186,13 +186,13 @@ async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient) assert "extracted_content" in result metadata = result["extracted_content"].get("metadata", {}) assert isinstance(metadata, dict) - # Vérification simple : titre et auteur peuvent exister 
+ assert "title" in metadata or "author" in metadata async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): """Test PDF scraping when PDF is not accessible.""" payload = { - "urls": [PDF_TEST_INVALID_URL], # URL invalide + "urls": [PDF_TEST_INVALID_URL], "crawler_config": { "type": "CrawlerRunConfig", "params": { @@ -211,7 +211,7 @@ async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient } response = await async_client.post("/crawl", json=payload) - # Le serveur doit répondre OK mais le résultat doit indiquer échec + data = response.json() assert data["success"] is True result = data["results"][0] From c709082ae0d01f8dde54bf55ffd21d65c24036a1 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:28:00 -0400 Subject: [PATCH 05/21] fix: remove unused code --- tests/docker/test_rest_api_pdf_crawl.py | 56 ------------------------- 1 file changed, 56 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 8549ac622..fd029f12c 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -18,41 +18,6 @@ PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" # --- Helper Functions --- -def load_proxies_from_env() -> List[Dict]: - """Load proxies from PROXIES environment variable""" - proxies = [] - proxies_str = os.getenv("PROXIES", "") - if not proxies_str: - print("PROXIES environment variable not set or empty.") - return proxies - try: - proxy_list = proxies_str.split(",") - for proxy in proxy_list: - proxy = proxy.strip() - if not proxy: - continue - parts = proxy.split(":") - if len(parts) == 4: - ip, port, username, password = parts - proxies.append({ - "server": f"http://{ip}:{port}", # Assuming http, adjust if needed - "username": username, - "password": password, - "ip": ip # Store original IP if available - }) - elif len(parts) == 2: # ip:port only - ip, port = parts - proxies.append({ - "server": f"http://{ip}:{port}", - "ip": ip - }) - else: - print(f"Skipping invalid proxy string format: {proxy}") - - except Exception as e: - print(f"Error loading proxies from environment: {e}") - return proxies - async def check_server_health(client: httpx.AsyncClient): """Check if the server is healthy before running tests.""" @@ -79,27 +44,6 @@ async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False) assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None -async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: - """Processes an NDJSON streaming response.""" - results = [] - completed = False - async for line in response.aiter_lines(): - if line: - try: - data = json.loads(line) - if data.get("status") == "completed": - completed = True - break # Stop processing after completion marker - elif data.get("url"): # Ensure it looks like a result object - results.append(data) - else: - print(f"Received non-result JSON line: {data}") # Log other status messages if needed - except json.JSONDecodeError: - pytest.fail(f"Failed to decode JSON line: {line}") - assert completed, "Streaming response did not end with a completion marker." 
- return results - - # --- Pytest Fixtures --- @pytest_asyncio.fixture(scope="function") async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: From 481e7fe21fb7f4c7dfdab611e22b79608aa65a11 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:36:07 -0400 Subject: [PATCH 06/21] fix: remove broken test --- tests/docker/test_rest_api_pdf_crawl.py | 37 ++----------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index fd029f12c..94328b20c 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -72,7 +72,7 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): "stream": False, "cache_mode": "BYPASS", "scraping_strategy": { - "type": "PdfScrapingStrategy", # Custom PDF scraping strategy + "type": "PDFContentScrapingStrategy", # Custom PDF scraping strategy "params": {} }, "deep_crawl_strategy": { @@ -100,39 +100,6 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): assert isinstance(extracted_text, str) assert len(extracted_text) > 0 - async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient): - """Test PDF scraping with metadata extraction.""" - payload = { - "urls": [PDF_TEST_URL], - "crawler_config": { - "type": "CrawlerRunConfig", - "params": { - "stream": False, - "cache_mode": "BYPASS", - "scraping_strategy": { - "type": "PdfScrapingStrategy", - "params": {"extract_metadata": True} - }, - "deep_crawl_strategy": { - "type": "BFSDeepCrawlStrategy", - "params": {"max_depth": 0, "max_pages": 1} - } - } - } - } - - response = await async_client.post("/crawl", json=payload) - response.raise_for_status() - data = response.json() - - assert data["success"] is True - result = data["results"][0] - assert "extracted_content" in result - metadata = result["extracted_content"].get("metadata", {}) - assert isinstance(metadata, dict) - - assert "title" in metadata or "author" in metadata - async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): """Test PDF scraping when PDF is not accessible.""" payload = { @@ -143,7 +110,7 @@ async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient "stream": False, "cache_mode": "BYPASS", "scraping_strategy": { - "type": "PdfScrapingStrategy", + "type": "PDFContentScrapingStrategy", "params": {} }, "deep_crawl_strategy": { From 726f41ada6409526f5af23194e151978bac16257 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:37:44 -0400 Subject: [PATCH 07/21] fix:remove deep crawl form tests --- tests/docker/test_rest_api_pdf_crawl.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 94328b20c..5710fde34 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -75,10 +75,6 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): "type": "PDFContentScrapingStrategy", # Custom PDF scraping strategy "params": {} }, - "deep_crawl_strategy": { - "type": "BFSDeepCrawlStrategy", - "params": {"max_depth": 0, "max_pages": 1} - } } } } @@ -113,10 +109,6 @@ async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient "type": "PDFContentScrapingStrategy", "params": {} }, - "deep_crawl_strategy": { - "type": "BFSDeepCrawlStrategy", - "params": {"max_depth": 0, "max_pages": 1} - } } } } From 
47bd3928006f510762486cde526a5052112e96ab Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:48:07 -0400 Subject: [PATCH 08/21] fix: coderabbit recommendations --- deploy/docker/api.py | 12 ++++++++++++ deploy/docker/crawler_pool.py | 8 +++++--- deploy/docker/utils.py | 20 +++++++++++--------- tests/docker/test_rest_api_pdf_crawl.py | 6 ++++-- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 9e88e28f7..88c9883b5 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -442,6 +442,12 @@ async def handle_crawl_request( is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) is_pdf = any(is_pdf_flags) + if any(is_pdf_flags) and not all(is_pdf_flags): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Mix of PDF and non-PDF URLs in a single request is not supported yet." + ) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: @@ -546,6 +552,12 @@ async def handle_stream_crawl_request( is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) is_pdf = any(is_pdf_flags) + if any(is_pdf_flags) and not all(is_pdf_flags): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Mix of PDF and non-PDF URLs in a single request is not supported yet." + ) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 82ba30b71..efc7076fc 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -31,6 +31,7 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: + sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) async with LOCK: @@ -48,12 +49,13 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - if sig in POOL: + if sig and sig in POOL: LAST_USED[sig] = time.time() else: # If we failed to start the browser, we should remove it from the pool - POOL.pop(sig, None) - LAST_USED.pop(sig, None) + if sig: + POOL.pop(sig, None) + LAST_USED.pop(sig, None) # If we failed to start the browser, we should remove it from the pool async def close_all(): diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index 7da18fda9..da3a688f3 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -137,20 +137,22 @@ async def is_pdf_url(url: str) -> bool: if url.lower().endswith(".pdf"): return True - try: - async with httpx.AsyncClient(follow_redirects=True) as client: - # HEAD request to check Content-Type - head_resp = await client.head(url) + timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0) + async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client: + # HEAD request to check Content-Type (ignore servers that reject HEAD) + try: + head_resp = await client.head(url, headers={"Accept": "*/*"}) content_type = head_resp.headers.get("content-type", "").lower() if "application/pdf" in content_type: return True + except httpx.HTTPError: + pass - # Fallback: GET first 5 bytes to check PDF magic number + # Fallback: GET first 5 bytes to check PDF magic number + try: get_resp = await client.get(url, headers={"Range": 
"bytes=0-4"}) if get_resp.status_code in (200, 206): # 206 Partial Content return get_resp.content.startswith(b"%PDF-") - except Exception: - return False - - return False + except httpx.HTTPError: + return False diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 5710fde34..63bf77154 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -12,8 +12,10 @@ load_dotenv() # Load environment variables from .env file if present # --- Test Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # If server is running in Docker, use the host's IP -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode +BASE_URL = os.getenv( + "CRAWL4AI_TEST_URL", + "http://localhost:11235", # Docker default; override via env for dev/debug (e.g., 8020) +) PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825" PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" From e17484f5769d56675671064c7037d8e771ff00d6 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:07:16 -0400 Subject: [PATCH 09/21] test: avoid overriding url --- docs/examples/docker/demo_docker_api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index f13e341e5..60b152f52 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -18,10 +18,8 @@ console = Console() # --- Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Target URLs -SIMPLE_URL = "https://example.com" # For demo purposes SIMPLE_URL = "https://httpbin.org/html" LINKS_URL = "https://httpbin.org/links/10/0" FORMS_URL = "https://httpbin.org/forms/post" # For JS demo From 94de653c9164eebca11cc5ed5c872881dba2671c Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:39:13 -0400 Subject: [PATCH 10/21] fix: security & lint --- deploy/docker/crawler_pool.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index efc7076fc..2af256563 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -27,7 +27,7 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: payload["strategy"] = crawler_strategy.__class__.__name__ json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":")) - return hashlib.sha1(json_payload.encode()).hexdigest() + return hashlib.sha256(json_payload.encode()).hexdigest() async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: @@ -36,16 +36,16 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N sig = _sig(cfg, crawler_strategy=crawler_strategy) async with LOCK: if sig in POOL: - LAST_USED[sig] = time.time(); + LAST_USED[sig] = time.time() return POOL[sig] if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure – new browser denied") + raise MemoryError("RAM pressure - new browser denied") crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) await crawler.start() POOL[sig] = crawler; LAST_USED[sig] = time.time() return crawler except MemoryError as e: - raise MemoryError(f"RAM pressure – new browser denied: {e}") + raise except Exception as e: 
raise RuntimeError(f"Failed to start browser: {e}") finally: From bc3b3d14ac7a0cf9879b2c389afc6d4361f6e38a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:39:52 -0400 Subject: [PATCH 11/21] fix: use safe callback --- docs/examples/docker/demo_docker_api.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 60b152f52..a098fc4c9 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -25,7 +25,7 @@ FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl -PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo +PDF_URL = "https://arxiv.org/pdf/2310.06825.pdf" # For PDF demo # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") @@ -1289,9 +1289,12 @@ async def demo_pdf_crawl(client: httpx.AsyncClient): print("Number of results:", len(data.get("results", []))) if data.get("results"): first = data["results"][0] - text_snippet = (first.get("text") or "")[:500] + text = first.get("extracted_content") or first.get("text") or "" + if isinstance(text, dict): + text = text.get("text") or "" print("Extracted text (first 500 chars):") - print(text_snippet) + print((text or "")[:500]) + # 11. Crawl PDF stream @@ -1306,7 +1309,7 @@ async def demo_pdf_crawl_stream(client: httpx.AsyncClient): "params": { "stream": True, "cache_mode": "BYPASS", - "scraping_strategy": { # <-- Default strategy if not set + "scraping_strategy": { "type": "PDFContentScrapingStrategy", "params": { "extract_images": False, From 44b280b1c472c78cd8327a26510b81b2b4cc4263 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:41:37 -0400 Subject: [PATCH 12/21] fix: guards and consistency --- tests/docker/test_rest_api_pdf_crawl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 63bf77154..6068a990b 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -16,7 +16,7 @@ "CRAWL4AI_TEST_URL", "http://localhost:11235", # Docker default; override via env for dev/debug (e.g., 8020) ) -PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825" +PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825.pdf" PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" # --- Helper Functions --- @@ -94,9 +94,9 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): assert "extracted_content" in result assert result["extracted_content"] is not None - extracted_text = result["extracted_content"].get("text", "") - assert isinstance(extracted_text, str) - assert len(extracted_text) > 0 + content = result.get("extracted_content") + extracted_text = content.get("text", "") if isinstance(content, dict) else (content or "") + assert isinstance(extracted_text, str) and len(extracted_text) > 0 async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): """Test PDF scraping when PDF is not accessible.""" From 21f80fffa5dccc259e335f04b476b9f1f457891a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:42:51 -0400 Subject: [PATCH 13/21] fix: use jsonable_encoder --- deploy/docker/api.py | 17 ++++++++++------- 1 file changed, 10 
insertions(+), 7 deletions(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 88c9883b5..4910bee18 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -37,7 +37,7 @@ ) from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy -from crawl4ai.async_configs import to_serializable_dict +# from crawl4ai.async_configs import to_serializable_dict from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy @@ -451,7 +451,6 @@ async def handle_crawl_request( crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: - # Default strategy if not set. crawler_config.scraping_strategy = PDFContentScrapingStrategy( extract_images=False, save_images_locally=False, @@ -484,6 +483,7 @@ async def handle_crawl_request( config=crawler_config, dispatcher=dispatcher) results = await partial_func() + results_list = results if isinstance(results, list) else [results] # await crawler.close() @@ -497,13 +497,14 @@ async def handle_crawl_request( # Process results to handle PDF bytes processed_results = [] - for result in results: + for result in results_list: result_dict = result.model_dump() # If PDF exists, encode it to base64 if result_dict.get('pdf') is not None: result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8') - processed_results.append(to_serializable_dict(result_dict)) + # Keep response shape consistent with streaming (plain JSON-serializable dict) + processed_results.append(jsonable_encoder(result_dict)) return { "success": True, @@ -550,6 +551,9 @@ async def handle_stream_crawl_request( crawler_config = CrawlerRunConfig.load(crawler_config) crawler_config.stream = True + # Normalize URLs to include scheme (match non-streaming behavior) + urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) is_pdf = any(is_pdf_flags) if any(is_pdf_flags) and not all(is_pdf_flags): @@ -561,9 +565,8 @@ async def handle_stream_crawl_request( crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: - # Default strategy if not set crawler_config.scraping_strategy = PDFContentScrapingStrategy( - extract_images=True, + extract_images=False, save_images_locally=False, batch_size=2 ) @@ -572,7 +575,7 @@ async def handle_stream_crawl_request( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], rate_limiter=RateLimiter( base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) - ) + ) if config["crawler"]["rate_limiter"]["enabled"] else None ) from crawler_pool import get_crawler From 16363836bb75be5218160a0b497c1c12044ac85f Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:03:12 -0400 Subject: [PATCH 14/21] fix: timeout params --- deploy/docker/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index da3a688f3..7cc487b35 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -137,7 +137,7 @@ async def is_pdf_url(url: str) -> bool: if url.lower().endswith(".pdf"): return True - timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0) + timeout = httpx.Timeout(5.0) async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client: # HEAD request to check 
Content-Type (ignore servers that reject HEAD) try: From 056721a89ae54ff8a438820de8966b2fbed29902 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:34:01 -0400 Subject: [PATCH 15/21] fix: http params --- deploy/docker/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index 7cc487b35..cf072b89a 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -137,7 +137,7 @@ async def is_pdf_url(url: str) -> bool: if url.lower().endswith(".pdf"): return True - timeout = httpx.Timeout(5.0) + timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0) async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client: # HEAD request to check Content-Type (ignore servers that reject HEAD) try: @@ -150,9 +150,12 @@ async def is_pdf_url(url: str) -> bool: # Fallback: GET first 5 bytes to check PDF magic number try: - get_resp = await client.get(url, headers={"Range": "bytes=0-4"}) + get_resp = await client.get(url, headers={"Range": "bytes=0-4", "Accept": "*/*"}) if get_resp.status_code in (200, 206): # 206 Partial Content return get_resp.content.startswith(b"%PDF-") except httpx.HTTPError: return False + + # Default: not a PDF (or unable to determine) + return False From ec2f88a1c1e61e2ec120b2d3d81a663e708bcc53 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:34:11 -0400 Subject: [PATCH 16/21] fix: consistency --- docs/examples/docker/demo_docker_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index a098fc4c9..542fb8980 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -1265,6 +1265,7 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient): async def demo_pdf_crawl(client: httpx.AsyncClient): payload = { "urls": [PDF_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { @@ -1304,6 +1305,7 @@ async def demo_pdf_crawl_stream(client: httpx.AsyncClient): """ payload = { "urls": [PDF_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { From 96815673f1f74179072a4eeddd34534c9a7bc10a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:36:48 -0400 Subject: [PATCH 17/21] fix: missing check --- deploy/docker/crawler_pool.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 2af256563..8eb1ccec0 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -49,14 +49,13 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - if sig and sig in POOL: - LAST_USED[sig] = time.time() - else: - # If we failed to start the browser, we should remove it from the pool - if sig: - POOL.pop(sig, None) - LAST_USED.pop(sig, None) - # If we failed to start the browser, we should remove it from the pool + async with LOCK: + if sig and sig in POOL: + LAST_USED[sig] = time.time() + else: + if sig: + POOL.pop(sig, None) + LAST_USED.pop(sig, None) async def close_all(): async with LOCK: From 87f35b88507e6b60c186811e6a8d8f1d1c135c54 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 11:27:05 -0400 
Subject: [PATCH 18/21] fix: dedupe concurrent creators --- deploy/docker/crawler_pool.py | 47 +++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 8eb1ccec0..4501466ea 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -30,25 +30,55 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: return hashlib.sha256(json_payload.encode()).hexdigest() -async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: +# Track in-flight creations to dedupe concurrent get_crawler() calls +CREATING: Dict[str, asyncio.Future] = {} + +async def get_crawler( + cfg: BrowserConfig, crawler_strategy: Optional[object] = None +) -> AsyncWebCrawler: sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) + + # First pass under lock: reuse or join/create in-flight async with LOCK: if sig in POOL: LAST_USED[sig] = time.time() return POOL[sig] - if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure - new browser denied") - crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) - await crawler.start() - POOL[sig] = crawler; LAST_USED[sig] = time.time() - return crawler - except MemoryError as e: + fut = CREATING.get(sig) + if fut is None: + if psutil.virtual_memory().percent >= MEM_LIMIT: + raise MemoryError("RAM pressure - new browser denied") + fut = asyncio.get_running_loop().create_future() + CREATING[sig] = fut + + # Outside lock: create/start if we're the creator + if not fut.done(): + try: + crawler = AsyncWebCrawler( + config=cfg, thread_safe=False, crawler_strategy=crawler_strategy + ) + await crawler.start() + async with LOCK: + POOL[sig] = crawler + LAST_USED[sig] = time.time() + fut.set_result(crawler) + except Exception as e: + fut.set_exception(e) + raise RuntimeError("Failed to start browser") from e + finally: + async with LOCK: + CREATING.pop(sig, None) + + # Await the shared result (crawler or the same exception) + return await fut + + except MemoryError: raise except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: + # Update last-used if a crawler exists async with LOCK: if sig and sig in POOL: LAST_USED[sig] = time.time() @@ -56,6 +86,7 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N if sig: POOL.pop(sig, None) LAST_USED.pop(sig, None) + async def close_all(): async with LOCK: From 2b80f98613b05f2ac1b998416fd0ff1da0ff378a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 12:15:56 -0400 Subject: [PATCH 19/21] =?UTF-8?q?fix:=20dedup=20race:=20non=E2=80=91creato?= =?UTF-8?q?r=20callers=20may=20double=E2=80=91create=20and=20set=20the=20s?= =?UTF-8?q?ame=20Future?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deploy/docker/__init__.py | 0 deploy/docker/crawler_pool.py | 24 ++++++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) create mode 100644 deploy/docker/__init__.py diff --git a/deploy/docker/__init__.py b/deploy/docker/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 4501466ea..fa9beda5e 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -3,8 +3,7 @@ from contextlib import suppress from typing import Dict, Optional from crawl4ai 
import AsyncWebCrawler, BrowserConfig -from utils import load_config - +from .utils import load_config CONFIG = load_config() @@ -36,24 +35,34 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: async def get_crawler( cfg: BrowserConfig, crawler_strategy: Optional[object] = None ) -> AsyncWebCrawler: + """ + Return a shared AsyncWebCrawler instance for the given config. + Only the 'creator' coroutine actually starts the crawler and sets + the future result to avoid InvalidStateError and double creation. + """ sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) - # First pass under lock: reuse or join/create in-flight + creator = False # Track whether *this* caller will create the crawler async with LOCK: + # Reuse an existing crawler if available if sig in POOL: LAST_USED[sig] = time.time() return POOL[sig] + + # Join in-flight creation if it exists fut = CREATING.get(sig) if fut is None: + # First caller becomes the creator if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure - new browser denied") + raise MemoryError("RAM pressure – new browser denied") fut = asyncio.get_running_loop().create_future() CREATING[sig] = fut + creator = True - # Outside lock: create/start if we're the creator - if not fut.done(): + if creator: + # Only the creator actually creates/starts the crawler try: crawler = AsyncWebCrawler( config=cfg, thread_safe=False, crawler_strategy=crawler_strategy @@ -78,7 +87,7 @@ async def get_crawler( except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - # Update last-used if a crawler exists + # Update last-used if a crawler now exists async with LOCK: if sig and sig in POOL: LAST_USED[sig] = time.time() @@ -86,7 +95,6 @@ async def get_crawler( if sig: POOL.pop(sig, None) LAST_USED.pop(sig, None) - async def close_all(): async with LOCK: From 703243dcb11343fd1d698e765d55aafb88f5e941 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 14:29:43 -0400 Subject: [PATCH 20/21] Revert "fix: use safe callback" This reverts commit bc3b3d14ac7a0cf9879b2c389afc6d4361f6e38a. --- docs/examples/docker/demo_docker_api.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 542fb8980..a0898109b 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -25,7 +25,7 @@ FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl -PDF_URL = "https://arxiv.org/pdf/2310.06825.pdf" # For PDF demo +PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") @@ -1290,12 +1290,9 @@ async def demo_pdf_crawl(client: httpx.AsyncClient): print("Number of results:", len(data.get("results", []))) if data.get("results"): first = data["results"][0] - text = first.get("extracted_content") or first.get("text") or "" - if isinstance(text, dict): - text = text.get("text") or "" + text_snippet = (first.get("text") or "")[:500] print("Extracted text (first 500 chars):") - print((text or "")[:500]) - + print(text_snippet) # 11. 
Crawl PDF stream @@ -1311,7 +1308,7 @@ async def demo_pdf_crawl_stream(client: httpx.AsyncClient): "params": { "stream": True, "cache_mode": "BYPASS", - "scraping_strategy": { + "scraping_strategy": { # <-- Default strategy if not set "type": "PDFContentScrapingStrategy", "params": { "extract_images": False, From 14994d88e93e622743a85cec1cec0ebf6b610bce Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 14:43:56 -0400 Subject: [PATCH 21/21] fix: revert to initial --- deploy/docker/__init__.py | 0 deploy/docker/crawler_pool.py | 77 +++++++++-------------------------- 2 files changed, 20 insertions(+), 57 deletions(-) delete mode 100644 deploy/docker/__init__.py diff --git a/deploy/docker/__init__.py b/deploy/docker/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index fa9beda5e..4652c0d6d 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -3,7 +3,8 @@ from contextlib import suppress from typing import Dict, Optional from crawl4ai import AsyncWebCrawler, BrowserConfig -from .utils import load_config +from utils import load_config + CONFIG = load_config() @@ -29,72 +30,34 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: return hashlib.sha256(json_payload.encode()).hexdigest() -# Track in-flight creations to dedupe concurrent get_crawler() calls -CREATING: Dict[str, asyncio.Future] = {} -async def get_crawler( - cfg: BrowserConfig, crawler_strategy: Optional[object] = None -) -> AsyncWebCrawler: - """ - Return a shared AsyncWebCrawler instance for the given config. - Only the 'creator' coroutine actually starts the crawler and sets - the future result to avoid InvalidStateError and double creation. 
- """ +async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) - - creator = False # Track whether *this* caller will create the crawler async with LOCK: - # Reuse an existing crawler if available if sig in POOL: - LAST_USED[sig] = time.time() + LAST_USED[sig] = time.time(); return POOL[sig] - - # Join in-flight creation if it exists - fut = CREATING.get(sig) - if fut is None: - # First caller becomes the creator - if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure – new browser denied") - fut = asyncio.get_running_loop().create_future() - CREATING[sig] = fut - creator = True - - if creator: - # Only the creator actually creates/starts the crawler - try: - crawler = AsyncWebCrawler( - config=cfg, thread_safe=False, crawler_strategy=crawler_strategy - ) - await crawler.start() - async with LOCK: - POOL[sig] = crawler - LAST_USED[sig] = time.time() - fut.set_result(crawler) - except Exception as e: - fut.set_exception(e) - raise RuntimeError("Failed to start browser") from e - finally: - async with LOCK: - CREATING.pop(sig, None) - - # Await the shared result (crawler or the same exception) - return await fut - - except MemoryError: - raise + if psutil.virtual_memory().percent >= MEM_LIMIT: + raise MemoryError("RAM pressure – new browser denied") + crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) + await crawler.start() + POOL[sig] = crawler; LAST_USED[sig] = time.time() + return crawler + except MemoryError as e: + raise MemoryError(f"RAM pressure – new browser denied: {e}") except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - # Update last-used if a crawler now exists - async with LOCK: - if sig and sig in POOL: - LAST_USED[sig] = time.time() - else: - if sig: - POOL.pop(sig, None) - LAST_USED.pop(sig, None) + if sig and sig in POOL: + LAST_USED[sig] = time.time() + else: + # If we failed to start the browser, we should remove it from the pool + if sig: + POOL.pop(sig, None) + LAST_USED.pop(sig, None) + # If we failed to start the browser, we should remove it from the pool async def close_all(): async with LOCK:
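Taken together, the series exposes the two PDF classes from the package root and wires them into the Docker server. They can also be used directly from Python, without the server. The sketch below is a minimal, illustrative example built only from the classes exported in crawl4ai/__init__.py and the parameters used in deploy/docker/api.py; the field that carries the extracted text (markdown here) may differ between crawl4ai versions.

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

PDF_URL = "https://arxiv.org/pdf/2310.06825"  # same sample document as the demo and tests

async def main() -> None:
    # PDFCrawlerStrategy fetches the document; PDFContentScrapingStrategy parses it.
    config = CrawlerRunConfig(
        scraping_strategy=PDFContentScrapingStrategy(
            extract_images=False,        # mirrors the defaults the server applies in api.py
            save_images_locally=False,
            batch_size=2,
        )
    )
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        result = await crawler.arun(PDF_URL, config=config)
        if result.success:
            print(str(result.markdown or "")[:500])
        else:
            print("Crawl failed:", result.error_message)

if __name__ == "__main__":
    asyncio.run(main())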
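Against the running Docker server the same behavior is reached through /crawl, with a payload shaped like the ones in docs/examples/docker/demo_docker_api.py and tests/docker/test_rest_api_pdf_crawl.py. With this series applied the scraping_strategy block is optional when every URL in the request is a PDF (the handler falls back to PDFContentScrapingStrategy), but passing it explicitly keeps the request self-describing. A hypothetical client, assuming the server listens on http://localhost:11235:

import asyncio

import httpx

PAYLOAD = {
    "urls": ["https://arxiv.org/pdf/2310.06825"],
    "crawler_config": {
        "type": "CrawlerRunConfig",
        "params": {
            "cache_mode": "BYPASS",
            "scraping_strategy": {
                "type": "PDFContentScrapingStrategy",
                "params": {"extract_images": False, "save_images_locally": False, "batch_size": 2},
            },
        },
    },
}

async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:11235", timeout=300.0) as client:
        resp = await client.post("/crawl", json=PAYLOAD)
        resp.raise_for_status()
        data = resp.json()
        print("success:", data.get("success"))
        for result in data.get("results", []):
            # Result field names vary between versions (the demo reads "text", the tests read
            # "extracted_content"), so inspect the keys before depending on a specific one.
            print(sorted(result.keys()))

asyncio.run(main())

Note that mixing PDF and non-PDF URLs in a single request is rejected with HTTP 400 by the guard added in handle_crawl_request, so PDF batches have to be sent separately from regular pages.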
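The streaming path behaves the same way: when every URL is a PDF, handle_stream_crawl_request installs the PDF strategies and emits newline-delimited JSON, one object per URL followed by a completion marker (the marker name below matches the helper that was removed from the test file; treat it as an assumption). A short consumer sketch:

import asyncio
import json

import httpx

async def stream_pdf_crawl(url: str) -> None:
    payload = {
        "urls": [url],
        "crawler_config": {
            "type": "CrawlerRunConfig",
            # scraping_strategy omitted on purpose: the server applies the PDF default.
            "params": {"stream": True, "cache_mode": "BYPASS"},
        },
    }
    async with httpx.AsyncClient(base_url="http://localhost:11235", timeout=300.0) as client:
        async with client.stream("POST", "/crawl/stream", json=payload) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line:
                    continue
                record = json.loads(line)
                if record.get("status") == "completed":
                    break
                print(record.get("url"), "->", record.get("success"))

# asyncio.run(stream_pdf_crawl("https://arxiv.org/pdf/2310.06825"))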