pricewatch/app/tasks/__init__.py (Executable file, 8 lines added)
@@ -0,0 +1,8 @@
"""
Tasks module for RQ jobs.
"""

from pricewatch.app.tasks.scrape import scrape_product
from pricewatch.app.tasks.scheduler import ScrapingScheduler

__all__ = ["scrape_product", "ScrapingScheduler"]
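Since the package re-exports both public entry points, downstream code can pull them in with a single import; a trivial sketch, assuming the pricewatch package is importable:

from pricewatch.app.tasks import ScrapingScheduler, scrape_product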
pricewatch/app/tasks/scheduler.py (Executable file, 75 lines added)
@@ -0,0 +1,75 @@
"""
Scheduling of scraping jobs via RQ Scheduler.
"""

from __future__ import annotations

from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional

import redis
from rq import Queue
from rq_scheduler import Scheduler

from pricewatch.app.core.config import AppConfig, get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.tasks.scrape import scrape_product

logger = get_logger("tasks.scheduler")


@dataclass
class ScheduledJobInfo:
    """Return values for a scheduled job."""

    job_id: str
    next_run: datetime


class ScrapingScheduler:
    """Scheduler for scraping jobs backed by RQ."""

    def __init__(self, config: Optional[AppConfig] = None, queue_name: str = "default") -> None:
        self.config = config or get_config()
        self.redis = redis.from_url(self.config.redis.url)
        self.queue = Queue(queue_name, connection=self.redis)
        self.scheduler = Scheduler(queue=self.queue, connection=self.redis)

    def enqueue_immediate(
        self,
        url: str,
        use_playwright: Optional[bool] = None,
        save_db: bool = True,
    ):
        """Enqueue a job for immediate execution."""
        job = self.queue.enqueue(
            scrape_product,
            url,
            use_playwright=use_playwright,
            save_db=save_db,
        )
        logger.info(f"Job enqueued: {job.id}")
        return job

    def schedule_product(
        self,
        url: str,
        interval_hours: int = 24,
        use_playwright: Optional[bool] = None,
        save_db: bool = True,
    ) -> ScheduledJobInfo:
        """Schedule a recurring scrape (interval given in hours)."""
        interval_seconds = int(timedelta(hours=interval_hours).total_seconds())
        next_run = datetime.now(timezone.utc) + timedelta(seconds=interval_seconds)

        job = self.scheduler.schedule(
            scheduled_time=next_run,
            func=scrape_product,
            args=[url],
            kwargs={"use_playwright": use_playwright, "save_db": save_db},
            interval=interval_seconds,
            repeat=None,  # None means repeat indefinitely at this interval
        )
        logger.info(f"Job scheduled: {job.id}, next run: {next_run.isoformat()}")
        return ScheduledJobInfo(job_id=job.id, next_run=next_run)
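A minimal driver sketch for this class, not part of the commit: it assumes Redis is reachable at the configured URL, an `rq worker default` process consumes the queue, and an `rqscheduler` process moves due jobs onto it (both are the standard RQ and rq-scheduler CLIs). The URL is a placeholder.

from pricewatch.app.tasks.scheduler import ScrapingScheduler

scheduler = ScrapingScheduler(queue_name="default")

# One-off job, executed by the next available worker.
job = scheduler.enqueue_immediate("https://www.amazon.fr/dp/B0EXAMPLE")

# Recurring job every 12 hours; rqscheduler re-enqueues it at each interval.
info = scheduler.schedule_product("https://www.amazon.fr/dp/B0EXAMPLE", interval_hours=12)
print(info.job_id, info.next_run.isoformat())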
pricewatch/app/tasks/scrape.py (Executable file, 160 lines added)
@@ -0,0 +1,160 @@
"""
Asynchronous scraping task for RQ.
"""

from __future__ import annotations

from typing import Any, Optional

from pricewatch.app.core.config import AppConfig, get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.registry import get_registry
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.aliexpress.store import AliexpressStore
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.backmarket.store import BackmarketStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore

logger = get_logger("tasks.scrape")


def setup_stores() -> None:
    """Register the available stores if not already done."""
    registry = get_registry()
    if registry.list_stores():
        return
    registry.register(AmazonStore())
    registry.register(CdiscountStore())
    registry.register(BackmarketStore())
    registry.register(AliexpressStore())


def scrape_product(
    url: str,
    use_playwright: Optional[bool] = None,
    save_db: bool = True,
    save_html: bool = False,
    save_screenshot: bool = False,
    headful: bool = False,
    timeout_ms: Optional[int] = None,
) -> dict[str, Any]:
    """
    Scrape a product and persist it to the database via ScrapingPipeline.

    Returns a dict with success, product_id, snapshot, and error keys.
    """
    config: AppConfig = get_config()
    setup_stores()

    if use_playwright is None:
        use_playwright = config.default_use_playwright

    if timeout_ms is None:
        timeout_ms = config.default_playwright_timeout

    registry = get_registry()
    store = registry.detect_store(url)
    if not store:
        snapshot = ProductSnapshot(
            source="unknown",
            url=url,
            debug=DebugInfo(
                method=FetchMethod.HTTP,
                status=DebugStatus.FAILED,
                errors=["No store detected"],
            ),
        )
        ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
        return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}

    canonical_url = store.canonicalize(url)

    html = None
    fetch_method = FetchMethod.HTTP
    fetch_error = None
    duration_ms = None
    html_size_bytes = None
    pw_result = None

    # Fetch over plain HTTP first; fall back to Playwright only when enabled.
    http_result = fetch_http(canonical_url)
    duration_ms = http_result.duration_ms

    if http_result.success:
        html = http_result.html
        fetch_method = FetchMethod.HTTP
    elif use_playwright:
        pw_result = fetch_playwright(
            canonical_url,
            headless=not headful,
            timeout_ms=timeout_ms,
            save_screenshot=save_screenshot,
        )
        duration_ms = pw_result.duration_ms

        if pw_result.success:
            html = pw_result.html
            fetch_method = FetchMethod.PLAYWRIGHT
        else:
            fetch_error = pw_result.error
    else:
        fetch_error = http_result.error

    if html:
        html_size_bytes = len(html.encode("utf-8"))
        if save_html:
            # Lazy import: only needed when dumping debug artifacts.
            from pricewatch.app.core.io import save_debug_html

            ref = store.extract_reference(canonical_url) or "unknown"
            save_debug_html(html, f"{store.store_id}_{ref}")

        if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result:
            from pricewatch.app.core.io import save_debug_screenshot

            if pw_result.screenshot:
                ref = store.extract_reference(canonical_url) or "unknown"
                save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")

        try:
            snapshot = store.parse(html, canonical_url)
            snapshot.debug.method = fetch_method
            snapshot.debug.duration_ms = duration_ms
            snapshot.debug.html_size_bytes = html_size_bytes
            success = snapshot.debug.status != DebugStatus.FAILED
        except Exception as exc:
            snapshot = ProductSnapshot(
                source=store.store_id,
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Parsing failed: {exc}"],
                    duration_ms=duration_ms,
                    html_size_bytes=html_size_bytes,
                ),
            )
            success = False
            fetch_error = str(exc)
    else:
        snapshot = ProductSnapshot(
            source=store.store_id,
            url=canonical_url,
            debug=DebugInfo(
                method=fetch_method,
                status=DebugStatus.FAILED,
                errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                duration_ms=duration_ms,
            ),
        )
        success = False

    product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)

    return {
        "success": success,
        "product_id": product_id,
        "snapshot": snapshot,
        "error": fetch_error,
    }
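Because scrape_product is a plain function, it can also be called synchronously, without a worker, which is handy for debugging a single URL; a minimal sketch, with a placeholder URL and persistence disabled:

from pricewatch.app.tasks.scrape import scrape_product

# Direct call, bypassing RQ entirely.
result = scrape_product(
    "https://www.cdiscount.com/example-product.html",
    use_playwright=False,  # stay on the plain-HTTP path
    save_db=False,         # skip the database while testing
)
if result["success"]:
    print(result["snapshot"].source, result["snapshot"].debug.status)
else:
    print("scrape failed:", result["error"])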