8 changes: 7 additions & 1 deletion scraper/metrics.py
@@ -193,7 +193,13 @@ def get_stats(self, days: int = 7) -> Dict[str, Any]:
         result = dict(stats)
         result['cache_hit_rate'] = (result['cache_hits'] / result['total_scrapes'] * 100) if result['total_scrapes'] > 0 else 0
         result['error_rate'] = (result['errors'] / result['total_scrapes'] * 100) if result['total_scrapes'] > 0 else 0
-        result['model_usage'] = [dict(row) for row in model_stats]
+
+        # Format model usage as a list of dicts, as expected by the TUI
+        model_usage_list = [
+            {"model": row["model"], "count": row["count"]}
+            for row in model_stats
+        ]
+        result['model_usage'] = model_usage_list
 
         return result

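For reference, here is a standalone sketch of the shape this change produces. The row values below are invented stand-ins for the sqlite3.Row results that `model_stats` presumably holds; the point of the explicit comprehension appears to be guaranteeing exactly the `"model"` and `"count"` keys the TUI reads, rather than whatever columns `dict(row)` happens to expose.

```python
# Standalone sketch only: these dicts stand in for sqlite3.Row objects,
# and the model names/counts are made up for illustration.
model_stats = [
    {"model": "ollama/llama3.1:8b", "count": 42},
    {"model": "ollama/mistral:7b", "count": 17},
]

model_usage_list = [
    {"model": row["model"], "count": row["count"]}
    for row in model_stats
]

print(model_usage_list)
# [{'model': 'ollama/llama3.1:8b', 'count': 42}, {'model': 'ollama/mistral:7b', 'count': 17}]
```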
39 changes: 28 additions & 11 deletions scraper/tui_integration.py
@@ -22,7 +22,7 @@
 from scrapegraphai.graphs import SmartScraperGraph
 
 from .fallback import ModelFallbackExecutor, ModelConfig, DEFAULT_FALLBACK_CHAIN
-from .ratelimit import RateLimiter, RateLimitConfig, RATE_LIMIT_PRESETS
+from .ratelimit import RateLimiter, RATE_LIMIT_PRESETS
 from .cache import ScraperCache
 from .metrics import MetricsDB
 from .models import SCHEMAS, validate_data
@@ -72,7 +72,8 @@ async def scrape_single_url(
 
         Returns:
             Tuple of (result_data, metadata)
-            metadata includes: execution_time, model_used, fallback_attempts, cached, validation_passed
+            metadata includes: execution_time, model_used, fallback_attempts,
+            cached, validation_passed
         """
         start_time = asyncio.get_event_loop().time()
         cached = False
@@ -88,9 +89,13 @@
         }
 
         if use_cache and self.cache.enabled:
-            cached_result = self.cache.get(url, prompt, **cache_key_params)
+            cached_result = self.cache.get(
+                url, prompt, **cache_key_params
+            )
             if cached_result:
-                execution_time = asyncio.get_event_loop().time() - start_time
+                execution_time = (
+                    asyncio.get_event_loop().time() - start_time
+                )
                 return cached_result, {
                     'execution_time': execution_time,
                     'model_used': model,
@@ -149,16 +154,24 @@ async def scrape_single_url(
 
         # Validate if schema provided
         if schema_name and schema_name in SCHEMAS:
-            is_valid, validated_data, error_msg = validate_data(result, schema_name)
+            is_valid, validated_data, error_msg = validate_data(
+                result, schema_name
+            )
             validation_passed = is_valid
             if is_valid:
-                result = validated_data.model_dump() if hasattr(validated_data, 'model_dump') else dict(validated_data)
+                result = (
+                    validated_data.model_dump()
+                    if hasattr(validated_data, 'model_dump')
+                    else dict(validated_data)
+                )
         else:
             validation_passed = None
 
         # Cache result
         if use_cache and self.cache.enabled:
-            self.cache.set(url, prompt, result, ttl_hours=24, **cache_key_params)
+            self.cache.set(
+                url, prompt, result, ttl_hours=24, **cache_key_params
+            )
 
         execution_time = asyncio.get_event_loop().time() - start_time
 
@@ -169,7 +182,9 @@
             model=model_used,
             execution_time=execution_time,
             fallback_attempts=fallback_attempts,
-            validation_passed=validation_passed if validation_passed is not None else True,
+            validation_passed=(
+                validation_passed if validation_passed is not None else True
+            ),
             cached=cached,
             schema_used=schema_name,
         )
@@ -315,10 +330,12 @@ def get_recent_scrapes(self, limit: int = 20) -> List[Dict[str, Any]]:
         Returns:
             List of scrape records
         """
-        # TODO: Implement this in MetricsDB
-        return []
+        recent_metrics = self.metrics_db.get_recent(limit=limit)
+        return [metric.to_dict() for metric in recent_metrics]
 
-    async def check_ollama_connection(self, base_url: str = "http://localhost:11434") -> bool:
+    async def check_ollama_connection(
+        self, base_url: str = "http://localhost:11434"
+    ) -> bool:
         """Check if Ollama is running
 
         Args:
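The get_recent_scrapes change above calls `self.metrics_db.get_recent(limit=limit)` and `metric.to_dict()`, neither of which appears in the metrics.py hunk shown in this PR, so both are assumed to exist elsewhere in MetricsDB. A minimal sketch of what such a method and record type might look like, assuming MetricsDB is backed by a sqlite "scrapes" table (table and column names here are illustrative, not taken from the PR):

```python
import sqlite3
from dataclasses import asdict, dataclass
from typing import Any, Dict, List


@dataclass
class ScrapeRecord:
    # Hypothetical columns; the real schema is not shown in this diff.
    url: str
    model: str
    execution_time: float
    cached: bool
    timestamp: str

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


def get_recent(db_path: str, limit: int = 20) -> List[ScrapeRecord]:
    """Illustrative stand-in for the MetricsDB.get_recent call used above."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute(
            "SELECT url, model, execution_time, cached, timestamp "
            "FROM scrapes ORDER BY timestamp DESC LIMIT ?",
            (limit,),
        ).fetchall()
    finally:
        conn.close()
    return [ScrapeRecord(**dict(row)) for row in rows]
```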
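The body of check_ollama_connection is collapsed in this diff, so its implementation is not visible here. One common way to implement such a check, sketched under the assumption that aiohttp is available (it is not confirmed as a project dependency), is to hit Ollama's /api/tags endpoint and treat any HTTP 200 as "running":

```python
import asyncio

import aiohttp  # assumption: the project may use a different HTTP client


async def check_ollama_connection(base_url: str = "http://localhost:11434") -> bool:
    """Return True if an Ollama server answers on base_url (sketch, not the PR's code)."""
    try:
        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=5)
        ) as session:
            async with session.get(f"{base_url}/api/tags") as resp:
                return resp.status == 200
    except (aiohttp.ClientError, asyncio.TimeoutError):
        return False


if __name__ == "__main__":
    print(asyncio.run(check_ollama_connection()))
```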