From 9c6e01686b8a1bc74de8c80f731f0e7e45dd0e5e Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 11:35:39 -0400 Subject: [PATCH 01/21] fix: add scraping strategy for pdf --- crawl4ai/__init__.py | 5 ++ deploy/docker/api.py | 40 ++++++++++++-- deploy/docker/crawler_pool.py | 28 +++++++--- deploy/docker/utils.py | 31 ++++++++++- docs/examples/docker/demo_docker_api.py | 69 +++++++++++++++++++++++++ 5 files changed, 160 insertions(+), 13 deletions(-) diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 6917f27e9..a6a1419af 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -10,6 +10,9 @@ LXMLWebScrapingStrategy, WebScrapingStrategy, # Backward compatibility alias ) + +from .processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy + from .async_logger import ( AsyncLoggerBase, AsyncLogger, @@ -128,6 +131,8 @@ "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", + "PDFCrawlerStrategy", + "PDFContentScrapingStrategy", "FilterChain", "URLPatternFilter", "ContentTypeFilter", diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 58d8c01fe..6b2b31468 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -13,8 +13,12 @@ from fastapi import HTTPException, Request, status from fastapi.background import BackgroundTasks from fastapi.responses import JSONResponse +from fastapi.encoders import jsonable_encoder + from redis import asyncio as aioredis +from utils import is_pdf_url + from crawl4ai import ( AsyncWebCrawler, CrawlerRunConfig, @@ -31,6 +35,10 @@ BM25ContentFilter, LLMContentFilter ) + +from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy +from crawl4ai.async_configs import to_serializable_dict + from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy @@ -431,6 +439,18 @@ async def handle_crawl_request( urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] browser_config = BrowserConfig.load(browser_config) crawler_config = CrawlerRunConfig.load(crawler_config) + + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) + is_pdf = any(is_pdf_flags) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None + + if is_pdf and crawler_config.scraping_strategy is None: + # Default strategy if not set + crawler_config.scraping_strategy = PDFContentScrapingStrategy( + extract_images=False, + save_images_locally=False, + batch_size=2 + ) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], @@ -440,7 +460,7 @@ async def handle_crawl_request( ) from crawler_pool import get_crawler - crawler = await get_crawler(browser_config) + crawler = await get_crawler(browser_config, crawler_strategy) # crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config) # await crawler.start() @@ -476,7 +496,8 @@ async def handle_crawl_request( # If PDF exists, encode it to base64 if result_dict.get('pdf') is not None: result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8') - processed_results.append(result_dict) + + processed_results.append(to_serializable_dict(result_dict)) return { "success": True, @@ -521,8 +542,19 @@ async def handle_stream_crawl_request( # browser_config.verbose = True # Set to False or remove for production stress testing browser_config.verbose = False crawler_config = CrawlerRunConfig.load(crawler_config) - 
crawler_config.scraping_strategy = LXMLWebScrapingStrategy() crawler_config.stream = True + + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) + is_pdf = any(is_pdf_flags) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None + + if is_pdf and crawler_config.scraping_strategy is None: + # Default strategy if not set + crawler_config.scraping_strategy = PDFContentScrapingStrategy( + extract_images=True, + save_images_locally=False, + batch_size=2 + ) dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], @@ -532,7 +564,7 @@ async def handle_stream_crawl_request( ) from crawler_pool import get_crawler - crawler = await get_crawler(browser_config) + crawler = await get_crawler(browser_config, crawler_strategy) # crawler = AsyncWebCrawler(config=browser_config) # await crawler.start() diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index d15102e4d..82ba30b71 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -1,11 +1,11 @@ # crawler_pool.py (new file) import asyncio, json, hashlib, time, psutil from contextlib import suppress -from typing import Dict +from typing import Dict, Optional from crawl4ai import AsyncWebCrawler, BrowserConfig -from typing import Dict from utils import load_config + CONFIG = load_config() POOL: Dict[str, AsyncWebCrawler] = {} @@ -15,20 +15,31 @@ MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM – refuse new browsers above this IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30 min -def _sig(cfg: BrowserConfig) -> str: - payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":")) - return hashlib.sha1(payload.encode()).hexdigest() +def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: + """ + Generate a unique signature for a crawler based on browser config + and optional crawler strategy. This ensures that crawlers with + different strategies (e.g., PDF) are stored separately in the pool. 
+ """ + payload = cfg.to_dict() + + if crawler_strategy is not None: + payload["strategy"] = crawler_strategy.__class__.__name__ + + json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":")) + return hashlib.sha1(json_payload.encode()).hexdigest() + -async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: +async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: try: - sig = _sig(cfg) + sig = _sig(cfg, crawler_strategy=crawler_strategy) async with LOCK: if sig in POOL: LAST_USED[sig] = time.time(); return POOL[sig] if psutil.virtual_memory().percent >= MEM_LIMIT: raise MemoryError("RAM pressure – new browser denied") - crawler = AsyncWebCrawler(config=cfg, thread_safe=False) + crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) await crawler.start() POOL[sig] = crawler; LAST_USED[sig] = time.time() return crawler @@ -44,6 +55,7 @@ async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler: POOL.pop(sig, None) LAST_USED.pop(sig, None) # If we failed to start the browser, we should remove it from the pool + async def close_all(): async with LOCK: await asyncio.gather(*(c.close() for c in POOL.values()), return_exceptions=True) diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index 2e2a80ac7..7da18fda9 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -2,6 +2,7 @@ import logging import yaml import os +import httpx from datetime import datetime from enum import Enum from pathlib import Path @@ -124,4 +125,32 @@ def verify_email_domain(email: str) -> bool: records = dns.resolver.resolve(domain, 'MX') return True if records else False except Exception as e: - return False \ No newline at end of file + return False + +async def is_pdf_url(url: str) -> bool: + """ + Check if a URL points to a PDF using httpx: + - Check extension + - Check Content-Type via HEAD request + - Check first 5 bytes (magic number) if needed + """ + if url.lower().endswith(".pdf"): + return True + + try: + async with httpx.AsyncClient(follow_redirects=True) as client: + # HEAD request to check Content-Type + head_resp = await client.head(url) + content_type = head_resp.headers.get("content-type", "").lower() + if "application/pdf" in content_type: + return True + + # Fallback: GET first 5 bytes to check PDF magic number + get_resp = await client.get(url, headers={"Range": "bytes=0-4"}) + if get_resp.status_code in (200, 206): # 206 Partial Content + return get_resp.content.startswith(b"%PDF-") + except Exception: + return False + + return False + diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 0a3d51af1..f13e341e5 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -27,6 +27,7 @@ FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl +PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") @@ -1261,6 +1262,71 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient): console.print( f"[bold red]Unexpected error during invalid test:[/] {e}") +# 10. 
Crawl PDF + +async def demo_pdf_crawl(client: httpx.AsyncClient): + payload = { + "urls": [PDF_URL], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PDFContentScrapingStrategy", + "params": { + "extract_images": False, + "save_images_locally": False, + "batch_size": 2 + } + } + } + } + } + + resp = await client.post("/crawl", json=payload) + resp.raise_for_status() + data = resp.json() + print("=== Demo: PDF Crawl ===") + print("Success:", data.get("success")) + print("Number of results:", len(data.get("results", []))) + if data.get("results"): + first = data["results"][0] + text_snippet = (first.get("text") or "")[:500] + print("Extracted text (first 500 chars):") + print(text_snippet) + +# 11. Crawl PDF stream + +async def demo_pdf_crawl_stream(client: httpx.AsyncClient): + """ + Demo: Crawl a PDF and stream the extracted text content. + """ + payload = { + "urls": [PDF_URL], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, + "cache_mode": "BYPASS", + "scraping_strategy": { # <-- Default strategy if not set + "type": "PDFContentScrapingStrategy", + "params": { + "extract_images": False, + "save_images_locally": False, + "batch_size": 2 + } + } + } + } + } + + await stream_request( + client, + "/crawl/stream", + payload, + "Demo PDF: Streaming PDF Crawl" + ) + # --- Update Main Runner to include new demo --- async def main_demo(): @@ -1294,6 +1360,9 @@ async def main_demo(): # await demo_deep_with_llm_extraction(client) # await demo_deep_with_proxy(client) # Skips if no PROXIES env var # await demo_deep_with_ssl(client) # Added the new demo + + # await demo_pdf_crawl_stream(client) + # await demo_pdf_crawl(client) # --- Helper endpoints --- await demo_markdown_endpoint(client) From 057fb61cf0d27075bb298162a776acdcb07f4fa9 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 11:37:47 -0400 Subject: [PATCH 02/21] fix: typo --- deploy/docker/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 6b2b31468..9e88e28f7 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -445,7 +445,7 @@ async def handle_crawl_request( crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: - # Default strategy if not set + # Default strategy if not set. 
crawler_config.scraping_strategy = PDFContentScrapingStrategy( extract_images=False, save_images_locally=False, From 5adc9dce066c481884b2c783881ce47ff72bb369 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:18:47 -0400 Subject: [PATCH 03/21] fix: add test suits --- tests/docker/test_rest_api_pdf_crawl.py | 229 ++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 tests/docker/test_rest_api_pdf_crawl.py diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py new file mode 100644 index 000000000..20b4bd008 --- /dev/null +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -0,0 +1,229 @@ +# ==== File: test_rest_api_deep_crawl.py ==== + +import pytest +import pytest_asyncio +import httpx +import json +import asyncio +import os +from typing import List, Dict, Any, AsyncGenerator + +from dotenv import load_dotenv +load_dotenv() # Load environment variables from .env file if present + +# --- Test Configuration --- +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # If server is running in Docker, use the host's IP +BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode +PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825" +PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" + +# --- Helper Functions --- +def load_proxies_from_env() -> List[Dict]: + """Load proxies from PROXIES environment variable""" + proxies = [] + proxies_str = os.getenv("PROXIES", "") + if not proxies_str: + print("PROXIES environment variable not set or empty.") + return proxies + try: + proxy_list = proxies_str.split(",") + for proxy in proxy_list: + proxy = proxy.strip() + if not proxy: + continue + parts = proxy.split(":") + if len(parts) == 4: + ip, port, username, password = parts + proxies.append({ + "server": f"http://{ip}:{port}", # Assuming http, adjust if needed + "username": username, + "password": password, + "ip": ip # Store original IP if available + }) + elif len(parts) == 2: # ip:port only + ip, port = parts + proxies.append({ + "server": f"http://{ip}:{port}", + "ip": ip + }) + else: + print(f"Skipping invalid proxy string format: {proxy}") + + except Exception as e: + print(f"Error loading proxies from environment: {e}") + return proxies + + +async def check_server_health(client: httpx.AsyncClient): + """Check if the server is healthy before running tests.""" + try: + response = await client.get("/health") + response.raise_for_status() + print(f"\nServer healthy: {response.json()}") + return True + except (httpx.RequestError, httpx.HTTPStatusError) as e: + pytest.fail(f"Server health check failed: {e}. 
Is the server running at {BASE_URL}?", pytrace=False) + +async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False): + """Asserts the basic structure of a single crawl result.""" + assert isinstance(result, dict) + assert "url" in result + assert "success" in result + assert "html" in result # Basic crawls should return HTML + assert "metadata" in result + assert isinstance(result["metadata"], dict) + assert "depth" in result["metadata"] # Deep crawls add depth + + if check_ssl: + assert "ssl_certificate" in result # Check if SSL info is present + assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None + + +async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: + """Processes an NDJSON streaming response.""" + results = [] + completed = False + async for line in response.aiter_lines(): + if line: + try: + data = json.loads(line) + if data.get("status") == "completed": + completed = True + break # Stop processing after completion marker + elif data.get("url"): # Ensure it looks like a result object + results.append(data) + else: + print(f"Received non-result JSON line: {data}") # Log other status messages if needed + except json.JSONDecodeError: + pytest.fail(f"Failed to decode JSON line: {line}") + assert completed, "Streaming response did not end with a completion marker." + return results + + +# --- Pytest Fixtures --- +@pytest_asyncio.fixture(scope="function") +async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: + """Provides an async HTTP client""" + # Increased timeout for potentially longer deep crawls + async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: + yield client + # No explicit close needed with 'async with' + +# --- Test Class for PDF Scraping --- +@pytest.mark.asyncio +class TestPdfScraping: + + @pytest_asyncio.fixture(autouse=True) + async def check_health_before_tests(self, async_client: httpx.AsyncClient): + """Fixture to ensure server is healthy before each test in the class.""" + await check_server_health(async_client) + + async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): + """Test basic PDF scraping for a single PDF URL.""" + payload = { + "urls": [PDF_TEST_URL], # URL of a test PDF + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PdfScrapingStrategy", # Custom PDF scraping strategy + "params": {} + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": {"max_depth": 0, "max_pages": 1} + } + } + } + } + + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() + + assert data["success"] is True + assert len(data["results"]) == 1 + + result = data["results"][0] + await assert_crawl_result_structure(result) + assert result["success"] is True + assert "extracted_content" in result + assert result["extracted_content"] is not None + # Vérifier que le texte extrait est non vide + extracted_text = result["extracted_content"].get("text", "") + assert isinstance(extracted_text, str) + assert len(extracted_text) > 0 + + async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient): + """Test PDF scraping with metadata extraction.""" + payload = { + "urls": [PDF_TEST_URL], + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PdfScrapingStrategy", 
+ "params": {"extract_metadata": True} # Param spécifique pour métadonnées + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": {"max_depth": 0, "max_pages": 1} + } + } + } + } + + response = await async_client.post("/crawl", json=payload) + response.raise_for_status() + data = response.json() + + assert data["success"] is True + result = data["results"][0] + assert "extracted_content" in result + metadata = result["extracted_content"].get("metadata", {}) + assert isinstance(metadata, dict) + # Vérification simple : titre et auteur peuvent exister + assert "title" in metadata or "author" in metadata + + async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): + """Test PDF scraping when PDF is not accessible.""" + payload = { + "urls": [PDF_TEST_INVALID_URL], # URL invalide + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": False, + "cache_mode": "BYPASS", + "scraping_strategy": { + "type": "PdfScrapingStrategy", + "params": {} + }, + "deep_crawl_strategy": { + "type": "BFSDeepCrawlStrategy", + "params": {"max_depth": 0, "max_pages": 1} + } + } + } + } + + response = await async_client.post("/crawl", json=payload) + # Le serveur doit répondre OK mais le résultat doit indiquer échec + data = response.json() + assert data["success"] is True + result = data["results"][0] + assert result["success"] is False + assert "extracted_content" not in result or result["extracted_content"] is None + + +# --- Main Execution Block (for running script directly) --- +if __name__ == "__main__": + pytest_args = ["-v", "-s", __file__] + # Example: Run only proxy test + # pytest_args.append("-k test_deep_crawl_with_proxies") + print(f"Running pytest with args: {pytest_args}") + exit_code = pytest.main(pytest_args) + print(f"Pytest finished with exit code: {exit_code}") \ No newline at end of file From d231d618b1399143e6a4005018e8cb797f5b56a5 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:23:43 -0400 Subject: [PATCH 04/21] fix: typo --- tests/docker/test_rest_api_pdf_crawl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 20b4bd008..8549ac622 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -1,4 +1,4 @@ -# ==== File: test_rest_api_deep_crawl.py ==== +# ==== File: test_rest_api_pdf_crawl.py ==== import pytest import pytest_asyncio @@ -151,7 +151,7 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): assert result["success"] is True assert "extracted_content" in result assert result["extracted_content"] is not None - # Vérifier que le texte extrait est non vide + extracted_text = result["extracted_content"].get("text", "") assert isinstance(extracted_text, str) assert len(extracted_text) > 0 @@ -167,7 +167,7 @@ async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient) "cache_mode": "BYPASS", "scraping_strategy": { "type": "PdfScrapingStrategy", - "params": {"extract_metadata": True} # Param spécifique pour métadonnées + "params": {"extract_metadata": True} }, "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", @@ -186,13 +186,13 @@ async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient) assert "extracted_content" in result metadata = result["extracted_content"].get("metadata", {}) assert isinstance(metadata, dict) - # Vérification simple : titre et auteur peuvent exister 
+ assert "title" in metadata or "author" in metadata async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): """Test PDF scraping when PDF is not accessible.""" payload = { - "urls": [PDF_TEST_INVALID_URL], # URL invalide + "urls": [PDF_TEST_INVALID_URL], "crawler_config": { "type": "CrawlerRunConfig", "params": { @@ -211,7 +211,7 @@ async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient } response = await async_client.post("/crawl", json=payload) - # Le serveur doit répondre OK mais le résultat doit indiquer échec + data = response.json() assert data["success"] is True result = data["results"][0] From c709082ae0d01f8dde54bf55ffd21d65c24036a1 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:28:00 -0400 Subject: [PATCH 05/21] fix: remove unused code --- tests/docker/test_rest_api_pdf_crawl.py | 56 ------------------------- 1 file changed, 56 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 8549ac622..fd029f12c 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -18,41 +18,6 @@ PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" # --- Helper Functions --- -def load_proxies_from_env() -> List[Dict]: - """Load proxies from PROXIES environment variable""" - proxies = [] - proxies_str = os.getenv("PROXIES", "") - if not proxies_str: - print("PROXIES environment variable not set or empty.") - return proxies - try: - proxy_list = proxies_str.split(",") - for proxy in proxy_list: - proxy = proxy.strip() - if not proxy: - continue - parts = proxy.split(":") - if len(parts) == 4: - ip, port, username, password = parts - proxies.append({ - "server": f"http://{ip}:{port}", # Assuming http, adjust if needed - "username": username, - "password": password, - "ip": ip # Store original IP if available - }) - elif len(parts) == 2: # ip:port only - ip, port = parts - proxies.append({ - "server": f"http://{ip}:{port}", - "ip": ip - }) - else: - print(f"Skipping invalid proxy string format: {proxy}") - - except Exception as e: - print(f"Error loading proxies from environment: {e}") - return proxies - async def check_server_health(client: httpx.AsyncClient): """Check if the server is healthy before running tests.""" @@ -79,27 +44,6 @@ async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False) assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None -async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]: - """Processes an NDJSON streaming response.""" - results = [] - completed = False - async for line in response.aiter_lines(): - if line: - try: - data = json.loads(line) - if data.get("status") == "completed": - completed = True - break # Stop processing after completion marker - elif data.get("url"): # Ensure it looks like a result object - results.append(data) - else: - print(f"Received non-result JSON line: {data}") # Log other status messages if needed - except json.JSONDecodeError: - pytest.fail(f"Failed to decode JSON line: {line}") - assert completed, "Streaming response did not end with a completion marker." 
- return results - - # --- Pytest Fixtures --- @pytest_asyncio.fixture(scope="function") async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]: From 481e7fe21fb7f4c7dfdab611e22b79608aa65a11 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:36:07 -0400 Subject: [PATCH 06/21] fix: remove broken test --- tests/docker/test_rest_api_pdf_crawl.py | 37 ++----------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index fd029f12c..94328b20c 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -72,7 +72,7 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): "stream": False, "cache_mode": "BYPASS", "scraping_strategy": { - "type": "PdfScrapingStrategy", # Custom PDF scraping strategy + "type": "PDFContentScrapingStrategy", # Custom PDF scraping strategy "params": {} }, "deep_crawl_strategy": { @@ -100,39 +100,6 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): assert isinstance(extracted_text, str) assert len(extracted_text) > 0 - async def test_pdf_scraping_with_metadata(self, async_client: httpx.AsyncClient): - """Test PDF scraping with metadata extraction.""" - payload = { - "urls": [PDF_TEST_URL], - "crawler_config": { - "type": "CrawlerRunConfig", - "params": { - "stream": False, - "cache_mode": "BYPASS", - "scraping_strategy": { - "type": "PdfScrapingStrategy", - "params": {"extract_metadata": True} - }, - "deep_crawl_strategy": { - "type": "BFSDeepCrawlStrategy", - "params": {"max_depth": 0, "max_pages": 1} - } - } - } - } - - response = await async_client.post("/crawl", json=payload) - response.raise_for_status() - data = response.json() - - assert data["success"] is True - result = data["results"][0] - assert "extracted_content" in result - metadata = result["extracted_content"].get("metadata", {}) - assert isinstance(metadata, dict) - - assert "title" in metadata or "author" in metadata - async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): """Test PDF scraping when PDF is not accessible.""" payload = { @@ -143,7 +110,7 @@ async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient "stream": False, "cache_mode": "BYPASS", "scraping_strategy": { - "type": "PdfScrapingStrategy", + "type": "PDFContentScrapingStrategy", "params": {} }, "deep_crawl_strategy": { From 726f41ada6409526f5af23194e151978bac16257 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:37:44 -0400 Subject: [PATCH 07/21] fix:remove deep crawl form tests --- tests/docker/test_rest_api_pdf_crawl.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 94328b20c..5710fde34 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -75,10 +75,6 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): "type": "PDFContentScrapingStrategy", # Custom PDF scraping strategy "params": {} }, - "deep_crawl_strategy": { - "type": "BFSDeepCrawlStrategy", - "params": {"max_depth": 0, "max_pages": 1} - } } } } @@ -113,10 +109,6 @@ async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient "type": "PDFContentScrapingStrategy", "params": {} }, - "deep_crawl_strategy": { - "type": "BFSDeepCrawlStrategy", - "params": {"max_depth": 0, "max_pages": 1} - } } } } From 
47bd3928006f510762486cde526a5052112e96ab Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Thu, 25 Sep 2025 12:48:07 -0400 Subject: [PATCH 08/21] fix: coderabbit recommendations --- deploy/docker/api.py | 12 ++++++++++++ deploy/docker/crawler_pool.py | 8 +++++--- deploy/docker/utils.py | 20 +++++++++++--------- tests/docker/test_rest_api_pdf_crawl.py | 6 ++++-- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 9e88e28f7..88c9883b5 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -442,6 +442,12 @@ async def handle_crawl_request( is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) is_pdf = any(is_pdf_flags) + if any(is_pdf_flags) and not all(is_pdf_flags): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Mix of PDF and non-PDF URLs in a single request is not supported yet." + ) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: @@ -546,6 +552,12 @@ async def handle_stream_crawl_request( is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) is_pdf = any(is_pdf_flags) + if any(is_pdf_flags) and not all(is_pdf_flags): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Mix of PDF and non-PDF URLs in a single request is not supported yet." + ) + crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 82ba30b71..efc7076fc 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -31,6 +31,7 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: + sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) async with LOCK: @@ -48,12 +49,13 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - if sig in POOL: + if sig and sig in POOL: LAST_USED[sig] = time.time() else: # If we failed to start the browser, we should remove it from the pool - POOL.pop(sig, None) - LAST_USED.pop(sig, None) + if sig: + POOL.pop(sig, None) + LAST_USED.pop(sig, None) # If we failed to start the browser, we should remove it from the pool async def close_all(): diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index 7da18fda9..da3a688f3 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -137,20 +137,22 @@ async def is_pdf_url(url: str) -> bool: if url.lower().endswith(".pdf"): return True - try: - async with httpx.AsyncClient(follow_redirects=True) as client: - # HEAD request to check Content-Type - head_resp = await client.head(url) + timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0) + async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client: + # HEAD request to check Content-Type (ignore servers that reject HEAD) + try: + head_resp = await client.head(url, headers={"Accept": "*/*"}) content_type = head_resp.headers.get("content-type", "").lower() if "application/pdf" in content_type: return True + except httpx.HTTPError: + pass - # Fallback: GET first 5 bytes to check PDF magic number + # Fallback: GET first 5 bytes to check PDF magic number + try: get_resp = await client.get(url, headers={"Range": 
"bytes=0-4"}) if get_resp.status_code in (200, 206): # 206 Partial Content return get_resp.content.startswith(b"%PDF-") - except Exception: - return False - - return False + except httpx.HTTPError: + return False diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 5710fde34..63bf77154 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -12,8 +12,10 @@ load_dotenv() # Load environment variables from .env file if present # --- Test Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # If server is running in Docker, use the host's IP -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode +BASE_URL = os.getenv( + "CRAWL4AI_TEST_URL", + "http://localhost:11235", # Docker default; override via env for dev/debug (e.g., 8020) +) PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825" PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" From e17484f5769d56675671064c7037d8e771ff00d6 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:07:16 -0400 Subject: [PATCH 09/21] test: avoid overriding url --- docs/examples/docker/demo_docker_api.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index f13e341e5..60b152f52 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -18,10 +18,8 @@ console = Console() # --- Configuration --- -BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Target URLs -SIMPLE_URL = "https://example.com" # For demo purposes SIMPLE_URL = "https://httpbin.org/html" LINKS_URL = "https://httpbin.org/links/10/0" FORMS_URL = "https://httpbin.org/forms/post" # For JS demo From 94de653c9164eebca11cc5ed5c872881dba2671c Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:39:13 -0400 Subject: [PATCH 10/21] fix: security & lint --- deploy/docker/crawler_pool.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index efc7076fc..2af256563 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -27,7 +27,7 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: payload["strategy"] = crawler_strategy.__class__.__name__ json_payload = json.dumps(payload, sort_keys=True, separators=(",", ":")) - return hashlib.sha1(json_payload.encode()).hexdigest() + return hashlib.sha256(json_payload.encode()).hexdigest() async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: @@ -36,16 +36,16 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N sig = _sig(cfg, crawler_strategy=crawler_strategy) async with LOCK: if sig in POOL: - LAST_USED[sig] = time.time(); + LAST_USED[sig] = time.time() return POOL[sig] if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure – new browser denied") + raise MemoryError("RAM pressure - new browser denied") crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) await crawler.start() POOL[sig] = crawler; LAST_USED[sig] = time.time() return crawler except MemoryError as e: - raise MemoryError(f"RAM pressure – new browser denied: {e}") + raise except Exception as e: 
raise RuntimeError(f"Failed to start browser: {e}") finally: From bc3b3d14ac7a0cf9879b2c389afc6d4361f6e38a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:39:52 -0400 Subject: [PATCH 11/21] fix: use safe callback --- docs/examples/docker/demo_docker_api.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 60b152f52..a098fc4c9 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -25,7 +25,7 @@ FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl -PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo +PDF_URL = "https://arxiv.org/pdf/2310.06825.pdf" # For PDF demo # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") @@ -1289,9 +1289,12 @@ async def demo_pdf_crawl(client: httpx.AsyncClient): print("Number of results:", len(data.get("results", []))) if data.get("results"): first = data["results"][0] - text_snippet = (first.get("text") or "")[:500] + text = first.get("extracted_content") or first.get("text") or "" + if isinstance(text, dict): + text = text.get("text") or "" print("Extracted text (first 500 chars):") - print(text_snippet) + print((text or "")[:500]) + # 11. Crawl PDF stream @@ -1306,7 +1309,7 @@ async def demo_pdf_crawl_stream(client: httpx.AsyncClient): "params": { "stream": True, "cache_mode": "BYPASS", - "scraping_strategy": { # <-- Default strategy if not set + "scraping_strategy": { "type": "PDFContentScrapingStrategy", "params": { "extract_images": False, From 44b280b1c472c78cd8327a26510b81b2b4cc4263 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:41:37 -0400 Subject: [PATCH 12/21] fix: guards and consistency --- tests/docker/test_rest_api_pdf_crawl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/docker/test_rest_api_pdf_crawl.py b/tests/docker/test_rest_api_pdf_crawl.py index 63bf77154..6068a990b 100644 --- a/tests/docker/test_rest_api_pdf_crawl.py +++ b/tests/docker/test_rest_api_pdf_crawl.py @@ -16,7 +16,7 @@ "CRAWL4AI_TEST_URL", "http://localhost:11235", # Docker default; override via env for dev/debug (e.g., 8020) ) -PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825" +PDF_TEST_URL = "https://arxiv.org/pdf/2310.06825.pdf" PDF_TEST_INVALID_URL = "https://docs.crawl4ai.com/samples/deepcrawl/" # --- Helper Functions --- @@ -94,9 +94,9 @@ async def test_pdf_scraping_basic(self, async_client: httpx.AsyncClient): assert "extracted_content" in result assert result["extracted_content"] is not None - extracted_text = result["extracted_content"].get("text", "") - assert isinstance(extracted_text, str) - assert len(extracted_text) > 0 + content = result.get("extracted_content") + extracted_text = content.get("text", "") if isinstance(content, dict) else (content or "") + assert isinstance(extracted_text, str) and len(extracted_text) > 0 async def test_pdf_scraping_non_accessible(self, async_client: httpx.AsyncClient): """Test PDF scraping when PDF is not accessible.""" From 21f80fffa5dccc259e335f04b476b9f1f457891a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 09:42:51 -0400 Subject: [PATCH 13/21] fix: use jsonable_encoder --- deploy/docker/api.py | 17 ++++++++++------- 1 file changed, 10 
insertions(+), 7 deletions(-) diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 88c9883b5..4910bee18 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -37,7 +37,7 @@ ) from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy -from crawl4ai.async_configs import to_serializable_dict +# from crawl4ai.async_configs import to_serializable_dict from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy @@ -451,7 +451,6 @@ async def handle_crawl_request( crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: - # Default strategy if not set. crawler_config.scraping_strategy = PDFContentScrapingStrategy( extract_images=False, save_images_locally=False, @@ -484,6 +483,7 @@ async def handle_crawl_request( config=crawler_config, dispatcher=dispatcher) results = await partial_func() + results_list = results if isinstance(results, list) else [results] # await crawler.close() @@ -497,13 +497,14 @@ async def handle_crawl_request( # Process results to handle PDF bytes processed_results = [] - for result in results: + for result in results_list: result_dict = result.model_dump() # If PDF exists, encode it to base64 if result_dict.get('pdf') is not None: result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8') - processed_results.append(to_serializable_dict(result_dict)) + # Keep response shape consistent with streaming (plain JSON-serializable dict) + processed_results.append(jsonable_encoder(result_dict)) return { "success": True, @@ -550,6 +551,9 @@ async def handle_stream_crawl_request( crawler_config = CrawlerRunConfig.load(crawler_config) crawler_config.stream = True + # Normalize URLs to include scheme (match non-streaming behavior) + urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls] + is_pdf_flags = await asyncio.gather(*(is_pdf_url(url) for url in urls)) is_pdf = any(is_pdf_flags) if any(is_pdf_flags) and not all(is_pdf_flags): @@ -561,9 +565,8 @@ async def handle_stream_crawl_request( crawler_strategy = PDFCrawlerStrategy() if is_pdf else None if is_pdf and crawler_config.scraping_strategy is None: - # Default strategy if not set crawler_config.scraping_strategy = PDFContentScrapingStrategy( - extract_images=True, + extract_images=False, save_images_locally=False, batch_size=2 ) @@ -572,7 +575,7 @@ async def handle_stream_crawl_request( memory_threshold_percent=config["crawler"]["memory_threshold_percent"], rate_limiter=RateLimiter( base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"]) - ) + ) if config["crawler"]["rate_limiter"]["enabled"] else None ) from crawler_pool import get_crawler From 16363836bb75be5218160a0b497c1c12044ac85f Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:03:12 -0400 Subject: [PATCH 14/21] fix: timeout params --- deploy/docker/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index da3a688f3..7cc487b35 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -137,7 +137,7 @@ async def is_pdf_url(url: str) -> bool: if url.lower().endswith(".pdf"): return True - timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0) + timeout = httpx.Timeout(5.0) async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client: # HEAD request to check 
Content-Type (ignore servers that reject HEAD) try: From 056721a89ae54ff8a438820de8966b2fbed29902 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:34:01 -0400 Subject: [PATCH 15/21] fix: http params --- deploy/docker/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index 7cc487b35..cf072b89a 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -137,7 +137,7 @@ async def is_pdf_url(url: str) -> bool: if url.lower().endswith(".pdf"): return True - timeout = httpx.Timeout(5.0) + timeout = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0) async with httpx.AsyncClient(follow_redirects=True, timeout=timeout) as client: # HEAD request to check Content-Type (ignore servers that reject HEAD) try: @@ -150,9 +150,12 @@ async def is_pdf_url(url: str) -> bool: # Fallback: GET first 5 bytes to check PDF magic number try: - get_resp = await client.get(url, headers={"Range": "bytes=0-4"}) + get_resp = await client.get(url, headers={"Range": "bytes=0-4", "Accept": "*/*"}) if get_resp.status_code in (200, 206): # 206 Partial Content return get_resp.content.startswith(b"%PDF-") except httpx.HTTPError: return False + + # Default: not a PDF (or unable to determine) + return False From ec2f88a1c1e61e2ec120b2d3d81a663e708bcc53 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:34:11 -0400 Subject: [PATCH 16/21] fix: consistency --- docs/examples/docker/demo_docker_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index a098fc4c9..542fb8980 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -1265,6 +1265,7 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient): async def demo_pdf_crawl(client: httpx.AsyncClient): payload = { "urls": [PDF_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { @@ -1304,6 +1305,7 @@ async def demo_pdf_crawl_stream(client: httpx.AsyncClient): """ payload = { "urls": [PDF_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { From 96815673f1f74179072a4eeddd34534c9a7bc10a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 10:36:48 -0400 Subject: [PATCH 17/21] fix: missing check --- deploy/docker/crawler_pool.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 2af256563..8eb1ccec0 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -49,14 +49,13 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - if sig and sig in POOL: - LAST_USED[sig] = time.time() - else: - # If we failed to start the browser, we should remove it from the pool - if sig: - POOL.pop(sig, None) - LAST_USED.pop(sig, None) - # If we failed to start the browser, we should remove it from the pool + async with LOCK: + if sig and sig in POOL: + LAST_USED[sig] = time.time() + else: + if sig: + POOL.pop(sig, None) + LAST_USED.pop(sig, None) async def close_all(): async with LOCK: From 87f35b88507e6b60c186811e6a8d8f1d1c135c54 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 11:27:05 -0400 
Subject: [PATCH 18/21] fix: dedupe concurrent creators --- deploy/docker/crawler_pool.py | 47 +++++++++++++++++++++++++++++------ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 8eb1ccec0..4501466ea 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -30,25 +30,55 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: return hashlib.sha256(json_payload.encode()).hexdigest() -async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: +# Track in-flight creations to dedupe concurrent get_crawler() calls +CREATING: Dict[str, asyncio.Future] = {} + +async def get_crawler( + cfg: BrowserConfig, crawler_strategy: Optional[object] = None +) -> AsyncWebCrawler: sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) + + # First pass under lock: reuse or join/create in-flight async with LOCK: if sig in POOL: LAST_USED[sig] = time.time() return POOL[sig] - if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure - new browser denied") - crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) - await crawler.start() - POOL[sig] = crawler; LAST_USED[sig] = time.time() - return crawler - except MemoryError as e: + fut = CREATING.get(sig) + if fut is None: + if psutil.virtual_memory().percent >= MEM_LIMIT: + raise MemoryError("RAM pressure - new browser denied") + fut = asyncio.get_running_loop().create_future() + CREATING[sig] = fut + + # Outside lock: create/start if we're the creator + if not fut.done(): + try: + crawler = AsyncWebCrawler( + config=cfg, thread_safe=False, crawler_strategy=crawler_strategy + ) + await crawler.start() + async with LOCK: + POOL[sig] = crawler + LAST_USED[sig] = time.time() + fut.set_result(crawler) + except Exception as e: + fut.set_exception(e) + raise RuntimeError("Failed to start browser") from e + finally: + async with LOCK: + CREATING.pop(sig, None) + + # Await the shared result (crawler or the same exception) + return await fut + + except MemoryError: raise except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: + # Update last-used if a crawler exists async with LOCK: if sig and sig in POOL: LAST_USED[sig] = time.time() @@ -56,6 +86,7 @@ async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = N if sig: POOL.pop(sig, None) LAST_USED.pop(sig, None) + async def close_all(): async with LOCK: From 2b80f98613b05f2ac1b998416fd0ff1da0ff378a Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 12:15:56 -0400 Subject: [PATCH 19/21] =?UTF-8?q?fix:=20dedup=20race:=20non=E2=80=91creato?= =?UTF-8?q?r=20callers=20may=20double=E2=80=91create=20and=20set=20the=20s?= =?UTF-8?q?ame=20Future?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- deploy/docker/__init__.py | 0 deploy/docker/crawler_pool.py | 24 ++++++++++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) create mode 100644 deploy/docker/__init__.py diff --git a/deploy/docker/__init__.py b/deploy/docker/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index 4501466ea..fa9beda5e 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -3,8 +3,7 @@ from contextlib import suppress from typing import Dict, Optional from crawl4ai 
import AsyncWebCrawler, BrowserConfig -from utils import load_config - +from .utils import load_config CONFIG = load_config() @@ -36,24 +35,34 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: async def get_crawler( cfg: BrowserConfig, crawler_strategy: Optional[object] = None ) -> AsyncWebCrawler: + """ + Return a shared AsyncWebCrawler instance for the given config. + Only the 'creator' coroutine actually starts the crawler and sets + the future result to avoid InvalidStateError and double creation. + """ sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) - # First pass under lock: reuse or join/create in-flight + creator = False # Track whether *this* caller will create the crawler async with LOCK: + # Reuse an existing crawler if available if sig in POOL: LAST_USED[sig] = time.time() return POOL[sig] + + # Join in-flight creation if it exists fut = CREATING.get(sig) if fut is None: + # First caller becomes the creator if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure - new browser denied") + raise MemoryError("RAM pressure – new browser denied") fut = asyncio.get_running_loop().create_future() CREATING[sig] = fut + creator = True - # Outside lock: create/start if we're the creator - if not fut.done(): + if creator: + # Only the creator actually creates/starts the crawler try: crawler = AsyncWebCrawler( config=cfg, thread_safe=False, crawler_strategy=crawler_strategy @@ -78,7 +87,7 @@ async def get_crawler( except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - # Update last-used if a crawler exists + # Update last-used if a crawler now exists async with LOCK: if sig and sig in POOL: LAST_USED[sig] = time.time() @@ -86,7 +95,6 @@ async def get_crawler( if sig: POOL.pop(sig, None) LAST_USED.pop(sig, None) - async def close_all(): async with LOCK: From 703243dcb11343fd1d698e765d55aafb88f5e941 Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 14:29:43 -0400 Subject: [PATCH 20/21] Revert "fix: use safe callback" This reverts commit bc3b3d14ac7a0cf9879b2c389afc6d4361f6e38a. --- docs/examples/docker/demo_docker_api.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 542fb8980..a0898109b 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -25,7 +25,7 @@ FORMS_URL = "https://httpbin.org/forms/post" # For JS demo BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction PYTHON_URL = "https://python.org" # For deeper crawl -PDF_URL = "https://arxiv.org/pdf/2310.06825.pdf" # For PDF demo +PDF_URL = "https://arxiv.org/pdf/2310.06825" # For PDF demo # Use the same sample site as deep crawl tests for consistency DEEP_CRAWL_BASE_URL = os.getenv( "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") @@ -1290,12 +1290,9 @@ async def demo_pdf_crawl(client: httpx.AsyncClient): print("Number of results:", len(data.get("results", []))) if data.get("results"): first = data["results"][0] - text = first.get("extracted_content") or first.get("text") or "" - if isinstance(text, dict): - text = text.get("text") or "" + text_snippet = (first.get("text") or "")[:500] print("Extracted text (first 500 chars):") - print((text or "")[:500]) - + print(text_snippet) # 11. 
Crawl PDF stream @@ -1311,7 +1308,7 @@ async def demo_pdf_crawl_stream(client: httpx.AsyncClient): "params": { "stream": True, "cache_mode": "BYPASS", - "scraping_strategy": { + "scraping_strategy": { # <-- Default strategy if not set "type": "PDFContentScrapingStrategy", "params": { "extract_images": False, From 14994d88e93e622743a85cec1cec0ebf6b610bce Mon Sep 17 00:00:00 2001 From: nicolas145 Date: Fri, 26 Sep 2025 14:43:56 -0400 Subject: [PATCH 21/21] fix: revert to initial --- deploy/docker/__init__.py | 0 deploy/docker/crawler_pool.py | 77 +++++++++-------------------------- 2 files changed, 20 insertions(+), 57 deletions(-) delete mode 100644 deploy/docker/__init__.py diff --git a/deploy/docker/__init__.py b/deploy/docker/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/deploy/docker/crawler_pool.py b/deploy/docker/crawler_pool.py index fa9beda5e..4652c0d6d 100644 --- a/deploy/docker/crawler_pool.py +++ b/deploy/docker/crawler_pool.py @@ -3,7 +3,8 @@ from contextlib import suppress from typing import Dict, Optional from crawl4ai import AsyncWebCrawler, BrowserConfig -from .utils import load_config +from utils import load_config + CONFIG = load_config() @@ -29,72 +30,34 @@ def _sig(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> str: return hashlib.sha256(json_payload.encode()).hexdigest() -# Track in-flight creations to dedupe concurrent get_crawler() calls -CREATING: Dict[str, asyncio.Future] = {} -async def get_crawler( - cfg: BrowserConfig, crawler_strategy: Optional[object] = None -) -> AsyncWebCrawler: - """ - Return a shared AsyncWebCrawler instance for the given config. - Only the 'creator' coroutine actually starts the crawler and sets - the future result to avoid InvalidStateError and double creation. 
- """ +async def get_crawler(cfg: BrowserConfig, crawler_strategy: Optional[object] = None) -> AsyncWebCrawler: sig: Optional[str] = None try: sig = _sig(cfg, crawler_strategy=crawler_strategy) - - creator = False # Track whether *this* caller will create the crawler async with LOCK: - # Reuse an existing crawler if available if sig in POOL: - LAST_USED[sig] = time.time() + LAST_USED[sig] = time.time(); return POOL[sig] - - # Join in-flight creation if it exists - fut = CREATING.get(sig) - if fut is None: - # First caller becomes the creator - if psutil.virtual_memory().percent >= MEM_LIMIT: - raise MemoryError("RAM pressure – new browser denied") - fut = asyncio.get_running_loop().create_future() - CREATING[sig] = fut - creator = True - - if creator: - # Only the creator actually creates/starts the crawler - try: - crawler = AsyncWebCrawler( - config=cfg, thread_safe=False, crawler_strategy=crawler_strategy - ) - await crawler.start() - async with LOCK: - POOL[sig] = crawler - LAST_USED[sig] = time.time() - fut.set_result(crawler) - except Exception as e: - fut.set_exception(e) - raise RuntimeError("Failed to start browser") from e - finally: - async with LOCK: - CREATING.pop(sig, None) - - # Await the shared result (crawler or the same exception) - return await fut - - except MemoryError: - raise + if psutil.virtual_memory().percent >= MEM_LIMIT: + raise MemoryError("RAM pressure – new browser denied") + crawler = AsyncWebCrawler(config=cfg, thread_safe=False, crawler_strategy=crawler_strategy) + await crawler.start() + POOL[sig] = crawler; LAST_USED[sig] = time.time() + return crawler + except MemoryError as e: + raise MemoryError(f"RAM pressure – new browser denied: {e}") except Exception as e: raise RuntimeError(f"Failed to start browser: {e}") finally: - # Update last-used if a crawler now exists - async with LOCK: - if sig and sig in POOL: - LAST_USED[sig] = time.time() - else: - if sig: - POOL.pop(sig, None) - LAST_USED.pop(sig, None) + if sig and sig in POOL: + LAST_USED[sig] = time.time() + else: + # If we failed to start the browser, we should remove it from the pool + if sig: + POOL.pop(sig, None) + LAST_USED.pop(sig, None) + # If we failed to start the browser, we should remove it from the pool async def close_all(): async with LOCK:
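Taken together, the series exposes the two PDF classes from the package root and wires them into the Docker server. They can also be used directly from Python, without the server. The sketch below is a minimal, illustrative example built only from the classes exported in crawl4ai/__init__.py and the parameters used in deploy/docker/api.py; the field that carries the extracted text (markdown here) may differ between crawl4ai versions.

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

PDF_URL = "https://arxiv.org/pdf/2310.06825"  # same sample document as the demo and tests

async def main() -> None:
    # PDFCrawlerStrategy fetches the document; PDFContentScrapingStrategy parses it.
    config = CrawlerRunConfig(
        scraping_strategy=PDFContentScrapingStrategy(
            extract_images=False,        # mirrors the defaults the server applies in api.py
            save_images_locally=False,
            batch_size=2,
        )
    )
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        result = await crawler.arun(PDF_URL, config=config)
        if result.success:
            print(str(result.markdown or "")[:500])
        else:
            print("Crawl failed:", result.error_message)

if __name__ == "__main__":
    asyncio.run(main())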
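Against the running Docker server the same behavior is reached through /crawl, with a payload shaped like the ones in docs/examples/docker/demo_docker_api.py and tests/docker/test_rest_api_pdf_crawl.py. With this series applied the scraping_strategy block is optional when every URL in the request is a PDF (the handler falls back to PDFContentScrapingStrategy), but passing it explicitly keeps the request self-describing. A hypothetical client, assuming the server listens on http://localhost:11235:

import asyncio

import httpx

PAYLOAD = {
    "urls": ["https://arxiv.org/pdf/2310.06825"],
    "crawler_config": {
        "type": "CrawlerRunConfig",
        "params": {
            "cache_mode": "BYPASS",
            "scraping_strategy": {
                "type": "PDFContentScrapingStrategy",
                "params": {"extract_images": False, "save_images_locally": False, "batch_size": 2},
            },
        },
    },
}

async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:11235", timeout=300.0) as client:
        resp = await client.post("/crawl", json=PAYLOAD)
        resp.raise_for_status()
        data = resp.json()
        print("success:", data.get("success"))
        for result in data.get("results", []):
            # Result field names vary between versions (the demo reads "text", the tests read
            # "extracted_content"), so inspect the keys before depending on a specific one.
            print(sorted(result.keys()))

asyncio.run(main())

Note that mixing PDF and non-PDF URLs in a single request is rejected with HTTP 400 by the guard added in handle_crawl_request, so PDF batches have to be sent separately from regular pages.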
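The streaming path behaves the same way: when every URL is a PDF, handle_stream_crawl_request installs the PDF strategies and emits newline-delimited JSON, one object per URL followed by a completion marker (the marker name below matches the helper that was removed from the test file; treat it as an assumption). A short consumer sketch:

import asyncio
import json

import httpx

async def stream_pdf_crawl(url: str) -> None:
    payload = {
        "urls": [url],
        "crawler_config": {
            "type": "CrawlerRunConfig",
            # scraping_strategy omitted on purpose: the server applies the PDF default.
            "params": {"stream": True, "cache_mode": "BYPASS"},
        },
    }
    async with httpx.AsyncClient(base_url="http://localhost:11235", timeout=300.0) as client:
        async with client.stream("POST", "/crawl/stream", json=payload) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line:
                    continue
                record = json.loads(line)
                if record.get("status") == "completed":
                    break
                print(record.get("url"), "->", record.get("success"))

# asyncio.run(stream_pdf_crawl("https://arxiv.org/pdf/2310.06825"))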