scrap/pricewatch/app/tasks/scrape.py

"""
Tache de scraping asynchrone pour RQ.
"""

from __future__ import annotations

import time
from typing import Any, Optional

from pricewatch.app.core.config import AppConfig, get_config
from pricewatch.app.core.logging import get_logger
from pricewatch.app.core.registry import get_registry
from pricewatch.app.core.schema import DebugInfo, DebugStatus, FetchMethod, ProductSnapshot
from pricewatch.app.scraping.http_fetch import fetch_http
from pricewatch.app.scraping.pipeline import ScrapingPipeline
from pricewatch.app.scraping.pw_fetch import fetch_playwright
from pricewatch.app.stores.aliexpress.store import AliexpressStore
from pricewatch.app.stores.amazon.store import AmazonStore
from pricewatch.app.stores.backmarket.store import BackmarketStore
from pricewatch.app.stores.cdiscount.store import CdiscountStore

logger = get_logger("tasks.scrape")


def setup_stores() -> None:
    """Enregistre les stores disponibles si besoin."""
    registry = get_registry()
    if registry.list_stores():
        return
    registry.register(AmazonStore())
    registry.register(CdiscountStore())
    registry.register(BackmarketStore())
    registry.register(AliexpressStore())


def scrape_product(
    url: str,
    use_playwright: Optional[bool] = None,
    save_db: bool = True,
    save_html: bool = False,
    save_screenshot: bool = False,
    headful: bool = False,
    timeout_ms: Optional[int] = None,
) -> dict[str, Any]:
    """
    Scrape un produit et persiste en base via ScrapingPipeline.

    Retourne un dict avec success, product_id, snapshot, error.
    """
    job_start_time = time.time()
    logger.info(f"[JOB START] Scraping: {url}")

    config: AppConfig = get_config()
    setup_stores()

    if use_playwright is None:
        use_playwright = config.default_use_playwright

    if timeout_ms is None:
        timeout_ms = config.default_playwright_timeout

    registry = get_registry()
    store = registry.detect_store(url)
    if not store:
        elapsed_ms = int((time.time() - job_start_time) * 1000)
        logger.error(f"[JOB FAILED] Aucun store detecte pour: {url} (duree={elapsed_ms}ms)")
        snapshot = ProductSnapshot(
            source="unknown",
            url=url,
            debug=DebugInfo(
                method=FetchMethod.HTTP,
                status=DebugStatus.FAILED,
                errors=["Aucun store detecte"],
            ),
        )
        ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)
        return {"success": False, "product_id": None, "snapshot": snapshot, "error": "store"}

    logger.info(f"[STORE] Detecte: {store.store_id}")

    canonical_url = store.canonicalize(url)

    html = None
    fetch_method = FetchMethod.HTTP
    fetch_error = None
    duration_ms = None
    html_size_bytes = None
    pw_result = None

    logger.debug(f"[FETCH] Tentative HTTP: {canonical_url}")
    http_result = fetch_http(canonical_url)
    duration_ms = http_result.duration_ms

    if http_result.success:
        html = http_result.html
        fetch_method = FetchMethod.HTTP
        logger.info(f"[FETCH] HTTP OK (duree={duration_ms}ms, taille={len(html)})")
    elif use_playwright:
        logger.debug(f"[FETCH] HTTP echoue ({http_result.error}), fallback Playwright")
        pw_result = fetch_playwright(
            canonical_url,
            headless=not headful,
            timeout_ms=timeout_ms,
            save_screenshot=save_screenshot,
        )
        duration_ms = pw_result.duration_ms

        if pw_result.success:
            html = pw_result.html
            fetch_method = FetchMethod.PLAYWRIGHT
            logger.info(f"[FETCH] Playwright OK (duree={duration_ms}ms, taille={len(html)})")
        else:
            fetch_error = pw_result.error
            logger.warning(f"[FETCH] Playwright echoue: {fetch_error}")
    else:
        fetch_error = http_result.error
        logger.warning(f"[FETCH] HTTP echoue: {fetch_error}")

    if html:
        html_size_bytes = len(html.encode("utf-8"))
        if save_html:
            from pricewatch.app.core.io import save_debug_html

            ref = store.extract_reference(canonical_url) or "unknown"
            save_debug_html(html, f"{store.store_id}_{ref}")

        if save_screenshot and fetch_method == FetchMethod.PLAYWRIGHT and pw_result:
            from pricewatch.app.core.io import save_debug_screenshot

            if pw_result and pw_result.screenshot:
                ref = store.extract_reference(canonical_url) or "unknown"
                save_debug_screenshot(pw_result.screenshot, f"{store.store_id}_{ref}")

        try:
            logger.debug(f"[PARSE] Parsing avec {store.store_id}...")
            snapshot = store.parse(html, canonical_url)
            snapshot.debug.method = fetch_method
            snapshot.debug.duration_ms = duration_ms
            snapshot.debug.html_size_bytes = html_size_bytes
            success = snapshot.debug.status != DebugStatus.FAILED
            if success:
                logger.info(f"[PARSE] OK - titre={bool(snapshot.title)}, prix={snapshot.price}")
            else:
                logger.warning(f"[PARSE] Partiel - status={snapshot.debug.status}")
        except Exception as exc:
            logger.error(f"[PARSE] Exception: {exc}")
            snapshot = ProductSnapshot(
                source=store.store_id,
                url=canonical_url,
                debug=DebugInfo(
                    method=fetch_method,
                    status=DebugStatus.FAILED,
                    errors=[f"Parsing failed: {exc}"],
                    duration_ms=duration_ms,
                    html_size_bytes=html_size_bytes,
                ),
            )
            success = False
            fetch_error = str(exc)
        # Si captcha detecte via HTTP, forcer une tentative Playwright.
        if (
            fetch_method == FetchMethod.HTTP
            and use_playwright
            and snapshot.debug.errors
            and any("captcha" in error.lower() for error in snapshot.debug.errors)
        ):
            logger.info("[FETCH] Captcha detecte, tentative Playwright")
            pw_result = fetch_playwright(
                canonical_url,
                headless=not headful,
                timeout_ms=timeout_ms,
                save_screenshot=save_screenshot,
            )
            if pw_result.success and pw_result.html:
                try:
                    snapshot = store.parse(pw_result.html, canonical_url)
                    snapshot.debug.method = FetchMethod.PLAYWRIGHT
                    snapshot.debug.duration_ms = pw_result.duration_ms
                    snapshot.debug.html_size_bytes = len(pw_result.html.encode("utf-8"))
                    snapshot.add_note("Captcha detecte via HTTP, fallback Playwright")
                    success = snapshot.debug.status != DebugStatus.FAILED
                except Exception as exc:
                    snapshot.add_note(f"Fallback Playwright echoue: {exc}")
                    logger.error(f"[PARSE] Exception fallback Playwright: {exc}")
                    fetch_error = str(exc)
            else:
                error = pw_result.error or "Erreur Playwright"
                snapshot.add_note(f"Fallback Playwright echoue: {error}")
                fetch_error = error
    else:
        snapshot = ProductSnapshot(
            source=store.store_id,
            url=canonical_url,
            debug=DebugInfo(
                method=fetch_method,
                status=DebugStatus.FAILED,
                errors=[f"Fetch failed: {fetch_error or 'Unknown error'}"],
                duration_ms=duration_ms,
            ),
        )
        success = False

    product_id = ScrapingPipeline(config=config).process_snapshot(snapshot, save_to_db=save_db)

    # Log final du job
    elapsed_ms = int((time.time() - job_start_time) * 1000)
    if success:
        logger.info(
            f"[JOB OK] {store.store_id}/{snapshot.reference} "
            f"product_id={product_id} prix={snapshot.price} duree={elapsed_ms}ms"
        )
    else:
        logger.warning(
            f"[JOB FAILED] {store.store_id}/{snapshot.reference or 'unknown'} "
            f"erreur={fetch_error} duree={elapsed_ms}ms"
        )

    return {
        "success": success,
        "product_id": product_id,
        "snapshot": snapshot,
        "error": fetch_error,
    }