Files
scrap/tests/scraping/test_pw_fetch.py
Gilles Soulier d0b73b9319 codex2
2026-01-14 21:54:55 +01:00

389 lines
13 KiB
Python

"""
Tests pour pricewatch.app.scraping.pw_fetch
Teste la récupération Playwright avec mocks pour éviter de lancer vraiment un navigateur.
"""
from unittest.mock import Mock, patch
import pytest
from playwright.sync_api import TimeoutError as PlaywrightTimeout
from pricewatch.app.scraping.pw_fetch import (
PlaywrightFetchResult,
fetch_playwright,
fetch_with_fallback,
)
class TestPlaywrightFetchResult:
    """Unit tests for the ``PlaywrightFetchResult`` result container."""

    def test_success_result(self):
        """A successful result exposes the given fields and has no error."""
        page_html = "<html>Test</html>"
        shot = b"fake_screenshot_bytes"

        outcome = PlaywrightFetchResult(
            success=True,
            html=page_html,
            screenshot=shot,
            duration_ms=2500,
        )

        assert outcome.success is True
        assert outcome.html == page_html
        assert outcome.screenshot == shot
        assert outcome.error is None
        assert outcome.duration_ms == 2500

    def test_error_result(self):
        """A failed result keeps its error, screenshot and duration; html stays unset."""
        shot = b"error_screenshot"

        outcome = PlaywrightFetchResult(
            success=False,
            error="Timeout",
            screenshot=shot,
            duration_ms=3000,
        )

        assert outcome.success is False
        assert outcome.html is None
        assert outcome.error == "Timeout"
        assert outcome.screenshot == shot
        assert outcome.duration_ms == 3000

    def test_minimal_result(self):
        """Only ``success`` is required; every other field is ``None`` when omitted."""
        outcome = PlaywrightFetchResult(success=False)

        assert outcome.success is False
        # Checked in the same order as the fields are declared in the assertions
        # of the other tests: html, screenshot, error, duration_ms.
        for field_name in ("html", "screenshot", "error", "duration_ms"):
            assert getattr(outcome, field_name) is None
class TestFetchPlaywright:
    """Tests for ``fetch_playwright()`` run against a mocked Playwright stack."""

    @pytest.fixture
    def mock_playwright_stack(self, mocker):
        """Replace ``sync_playwright`` in ``pw_fetch`` with a chain of mocks.

        The chain wired here is:
        ``sync_playwright()`` -> ``.start()`` -> playwright object ->
        ``.chromium.launch()`` -> browser -> ``.new_context()`` -> context ->
        ``.new_page()`` -> page.

        Returns:
            dict: the ``"playwright"``, ``"browser"``, ``"context"`` and
            ``"page"`` mocks, so tests can tweak return values / side effects
            and inspect the recorded calls.
        """
        # Page: canned HTML, canned screenshot bytes and a 200 navigation response.
        page = Mock()
        page.content.return_value = "<html><body>Playwright Test</body></html>"
        page.screenshot.return_value = b"fake_screenshot_data"
        page.goto.return_value = Mock(status=200)

        # Each layer simply hands out the layer below it.
        context = Mock(**{"new_page.return_value": page})
        browser = Mock(**{"new_context.return_value": context})
        chromium = Mock(**{"launch.return_value": browser})
        playwright_obj = Mock(chromium=chromium)

        # Object returned by sync_playwright(); its start() yields the playwright mock.
        entry_point = Mock(**{"start.return_value": playwright_obj})
        mocker.patch(
            "pricewatch.app.scraping.pw_fetch.sync_playwright",
            return_value=entry_point,
        )

        return {
            "playwright": playwright_obj,
            "browser": browser,
            "context": context,
            "page": page,
        }

    def test_fetch_success(self, mock_playwright_stack):
        """A plain fetch succeeds and returns the page HTML without a screenshot."""
        outcome = fetch_playwright("https://example.com")

        assert outcome.success is True
        assert outcome.html == "<html><body>Playwright Test</body></html>"
        assert outcome.screenshot is None  # no screenshot unless requested
        assert outcome.error is None
        assert outcome.duration_ms is not None
        assert outcome.duration_ms >= 0

        # The page must have been navigated exactly once to the requested URL.
        mock_playwright_stack["page"].goto.assert_called_once_with(
            "https://example.com", wait_until="domcontentloaded"
        )

    def test_fetch_with_screenshot(self, mock_playwright_stack):
        """``save_screenshot=True`` returns the bytes produced by ``page.screenshot()``."""
        outcome = fetch_playwright("https://example.com", save_screenshot=True)

        assert outcome.success is True
        assert outcome.screenshot == b"fake_screenshot_data"
        mock_playwright_stack["page"].screenshot.assert_called_once()

    def test_fetch_headful_mode(self, mock_playwright_stack):
        """``headless=False`` is forwarded to ``chromium.launch()``."""
        outcome = fetch_playwright("https://example.com", headless=False)

        assert outcome.success is True

        launch = mock_playwright_stack["playwright"].chromium.launch
        launch.assert_called_once()
        assert launch.call_args.kwargs["headless"] is False

    def test_fetch_with_custom_timeout(self, mock_playwright_stack):
        """A custom ``timeout_ms`` is applied through ``page.set_default_timeout()``."""
        outcome = fetch_playwright("https://example.com", timeout_ms=30000)

        assert outcome.success is True
        mock_playwright_stack["page"].set_default_timeout.assert_called_once_with(30000)

    def test_fetch_with_wait_for_selector(self, mock_playwright_stack):
        """``wait_for_selector`` is forwarded to the page with the expected timeout."""
        outcome = fetch_playwright(
            "https://example.com", wait_for_selector=".product-title"
        )

        assert outcome.success is True
        # No timeout_ms is passed here; the test expects 60000 ms to be used.
        mock_playwright_stack["page"].wait_for_selector.assert_called_once_with(
            ".product-title", timeout=60000
        )

    def test_fetch_wait_for_selector_timeout(self, mock_playwright_stack):
        """A selector timeout does not fail the fetch."""
        page = mock_playwright_stack["page"]
        page.wait_for_selector.side_effect = PlaywrightTimeout("Selector timeout")

        outcome = fetch_playwright(
            "https://example.com", wait_for_selector=".non-existent"
        )

        # Make sure the timeout was actually raised; otherwise this test would
        # also pass if the selector were silently ignored.
        page.wait_for_selector.assert_called()
        # The fetch must still succeed: waiting for the selector is non-blocking.
        assert outcome.success is True
        assert outcome.html is not None

    def test_fetch_empty_url(self, mock_playwright_stack):
        """An empty URL yields a failed result whose error mentions "URL vide"."""
        # The fixture is requested only to keep sync_playwright patched, so a
        # regression in URL validation can never start a real browser.
        outcome = fetch_playwright("")

        assert outcome.success is False
        assert "URL vide" in outcome.error
        assert outcome.html is None

    def test_fetch_whitespace_url(self, mock_playwright_stack):
        """A whitespace-only URL is rejected like an empty one."""
        # Fixture requested for isolation only (see test_fetch_empty_url's intent:
        # no real browser may be launched from this test module).
        outcome = fetch_playwright(" ")

        assert outcome.success is False
        assert "URL vide" in outcome.error

    def test_fetch_no_response_from_server(self, mock_playwright_stack):
        """``page.goto()`` returning ``None`` is reported as a missing server response."""
        mock_playwright_stack["page"].goto.return_value = None

        outcome = fetch_playwright("https://example.com")

        assert outcome.success is False
        assert "Pas de réponse du serveur" in outcome.error

    def test_fetch_playwright_timeout(self, mock_playwright_stack):
        """A navigation timeout produces a failed result that still carries a duration."""
        mock_playwright_stack["page"].goto.side_effect = PlaywrightTimeout(
            "Navigation timeout"
        )

        outcome = fetch_playwright("https://example.com", timeout_ms=10000)

        assert outcome.success is False
        assert "Timeout" in outcome.error
        assert outcome.duration_ms is not None

    def test_fetch_playwright_generic_error(self, mock_playwright_stack):
        """Any other exception during navigation is reported as a Playwright error."""
        mock_playwright_stack["page"].goto.side_effect = Exception(
            "Generic Playwright error"
        )

        outcome = fetch_playwright("https://example.com")

        assert outcome.success is False
        assert "Erreur Playwright" in outcome.error
        assert outcome.duration_ms is not None

    def test_fetch_cleanup_on_success(self, mock_playwright_stack):
        """Page, browser and playwright are each released once after a successful fetch."""
        outcome = fetch_playwright("https://example.com")

        assert outcome.success is True
        mock_playwright_stack["page"].close.assert_called_once()
        mock_playwright_stack["browser"].close.assert_called_once()
        mock_playwright_stack["playwright"].stop.assert_called_once()

    def test_fetch_cleanup_on_error(self, mock_playwright_stack):
        """Page, browser and playwright are each released once even when navigation fails."""
        mock_playwright_stack["page"].goto.side_effect = Exception("Test error")

        outcome = fetch_playwright("https://example.com")

        assert outcome.success is False
        mock_playwright_stack["page"].close.assert_called_once()
        mock_playwright_stack["browser"].close.assert_called_once()
        mock_playwright_stack["playwright"].stop.assert_called_once()

    def test_fetch_screenshot_on_error(self, mock_playwright_stack):
        """With ``save_screenshot=True`` a screenshot is captured even on failure."""
        page = mock_playwright_stack["page"]
        page.goto.side_effect = PlaywrightTimeout("Timeout")

        outcome = fetch_playwright("https://example.com", save_screenshot=True)

        assert outcome.success is False
        assert outcome.screenshot == b"fake_screenshot_data"
        # The screenshot must have been attempted exactly once.
        page.screenshot.assert_called_once()
class TestFetchWithFallback:
    """Tests for ``fetch_with_fallback()`` with both fetchers mocked out.

    NOTE(review): ``fetch_http`` is patched on the ``http_fetch`` module while
    ``fetch_playwright`` is patched on ``pw_fetch``; the former only takes
    effect if ``fetch_with_fallback`` resolves ``fetch_http`` from
    ``http_fetch`` at call time -- confirm against the implementation.
    """

    def test_http_success_no_playwright(self, mocker):
        """When the HTTP fetch succeeds, its result is returned and Playwright is not used."""
        # HTTP fetcher that reports success.
        http_ok = Mock(
            success=True,
            html="<html>HTTP Success</html>",
            duration_ms=150,
        )
        mocker.patch(
            "pricewatch.app.scraping.http_fetch.fetch_http",
            return_value=http_ok,
        )
        # Playwright fetcher that must stay untouched.
        pw_spy = mocker.patch("pricewatch.app.scraping.pw_fetch.fetch_playwright")

        outcome = fetch_with_fallback("https://example.com")

        assert outcome.success is True
        assert outcome.html == "<html>HTTP Success</html>"
        assert outcome.duration_ms == 150
        pw_spy.assert_not_called()

    def test_http_fails_playwright_fallback(self, mocker):
        """When the HTTP fetch fails, the Playwright result is returned instead."""
        # HTTP fetcher that reports a failure.
        http_failed = Mock(success=False, error="403 Forbidden")
        mocker.patch(
            "pricewatch.app.scraping.http_fetch.fetch_http",
            return_value=http_failed,
        )
        # Playwright fetcher that succeeds.
        pw_ok = PlaywrightFetchResult(
            success=True,
            html="<html>Playwright Success</html>",
            duration_ms=2500,
        )
        pw_spy = mocker.patch(
            "pricewatch.app.scraping.pw_fetch.fetch_playwright",
            return_value=pw_ok,
        )

        outcome = fetch_with_fallback("https://example.com")

        assert outcome.success is True
        assert outcome.html == "<html>Playwright Success</html>"
        pw_spy.assert_called_once()

    def test_skip_http_direct_playwright(self, mocker):
        """``try_http_first=False`` goes straight to Playwright without calling HTTP."""
        # HTTP fetcher that must stay untouched.
        http_spy = mocker.patch("pricewatch.app.scraping.http_fetch.fetch_http")
        pw_ok = PlaywrightFetchResult(
            success=True,
            html="<html>Playwright Direct</html>",
            duration_ms=2500,
        )
        pw_spy = mocker.patch(
            "pricewatch.app.scraping.pw_fetch.fetch_playwright",
            return_value=pw_ok,
        )

        outcome = fetch_with_fallback("https://example.com", try_http_first=False)

        assert outcome.success is True
        assert outcome.html == "<html>Playwright Direct</html>"
        http_spy.assert_not_called()
        pw_spy.assert_called_once()

    def test_playwright_options_passed(self, mocker):
        """``playwright_options`` are forwarded to ``fetch_playwright`` as keyword arguments."""
        # HTTP fetcher that fails, forcing the Playwright fallback.
        http_failed = Mock(success=False, error="403 Forbidden")
        mocker.patch(
            "pricewatch.app.scraping.http_fetch.fetch_http",
            return_value=http_failed,
        )
        pw_ok = PlaywrightFetchResult(
            success=True,
            html="<html>OK</html>",
            duration_ms=2500,
        )
        pw_spy = mocker.patch(
            "pricewatch.app.scraping.pw_fetch.fetch_playwright",
            return_value=pw_ok,
        )

        # Custom options expected to reach fetch_playwright unchanged.
        custom_options = {
            "headless": False,
            "timeout_ms": 30000,
            "save_screenshot": True,
        }
        outcome = fetch_with_fallback(
            "https://example.com", playwright_options=custom_options
        )

        assert outcome.success is True
        pw_spy.assert_called_once_with("https://example.com", **custom_options)