# scrap/tests/stores/test_aliexpress.py

#!/usr/bin/env python3
"""Tests pour le store AliExpress."""

import pytest
from pathlib import Path

from pricewatch.app.stores.aliexpress.store import AliexpressStore


class TestAliexpressStore:
    """Tests for AliexpressStore.

    Covers URL matching, canonicalization, reference (SKU) extraction and
    HTML parsing into a product snapshot.
    """

    @pytest.fixture
    def store(self):
        """Provide a fresh AliexpressStore instance for each test."""
        return AliexpressStore()

    # ========== match() tests ==========

    def test_match_aliexpress_com_product(self, store):
        """An aliexpress.com/item/ URL is recognised as a product page."""
        url = "https://www.aliexpress.com/item/1005007187023722.html"
        assert store.match(url) == 0.9

    def test_match_aliexpress_fr_product(self, store):
        """A fr.aliexpress.com/item/ URL is recognised as a product page."""
        url = "https://fr.aliexpress.com/item/1005007187023722.html"
        assert store.match(url) == 0.9

    def test_match_aliexpress_non_product(self, store):
        """An aliexpress.com URL without /item/ gets a reduced score."""
        url = "https://www.aliexpress.com/category/electronics"
        assert store.match(url) == 0.5

    # Parametrized so that every URL is reported independently instead of the
    # first failure hiding the remaining ones.
    @pytest.mark.parametrize(
        "url",
        [
            "https://www.amazon.fr/dp/ASIN",
            "https://www.cdiscount.com/f-123-abc.html",
            "",
        ],
        ids=["amazon", "cdiscount", "empty"],
    )
    def test_match_other_site(self, store, url):
        """URLs from other sites, and the empty string, score 0.0.

        ``None`` is not part of the inputs: match() is only exercised with
        ``str`` values here, so its behaviour on ``None`` is not pinned.
        """
        assert store.match(url) == 0.0

    def test_match_case_insensitive(self, store):
        """Matching is case-insensitive."""
        url = "https://FR.ALIEXPRESS.COM/ITEM/1234567890.HTML"
        assert store.match(url) == 0.9

    # ========== canonicalize() tests ==========

    def test_canonicalize_remove_query_params(self, store):
        """canonicalize() strips the query string."""
        url = "https://fr.aliexpress.com/item/1005007187023722.html?spm=a2g0o.detail.0.0"
        canonical = store.canonicalize(url)
        assert canonical == "https://fr.aliexpress.com/item/1005007187023722.html"

    def test_canonicalize_remove_fragment(self, store):
        """canonicalize() strips the fragment (#...)."""
        url = "https://fr.aliexpress.com/item/1005007187023722.html#reviews"
        canonical = store.canonicalize(url)
        assert canonical == "https://fr.aliexpress.com/item/1005007187023722.html"

    def test_canonicalize_keep_item_path(self, store):
        """canonicalize() keeps the /item/{ID}.html path untouched."""
        url = "https://fr.aliexpress.com/item/1005007187023722.html"
        canonical = store.canonicalize(url)
        assert canonical == "https://fr.aliexpress.com/item/1005007187023722.html"

    def test_canonicalize_empty_url(self, store):
        """canonicalize() returns empty/None input unchanged."""
        assert store.canonicalize("") == ""
        assert store.canonicalize(None) is None

    # ========== extract_reference() tests ==========

    def test_extract_reference_standard_format(self, store):
        """The SKU is extracted from the standard /item/{ID}.html format."""
        url = "https://fr.aliexpress.com/item/1005007187023722.html"
        assert store.extract_reference(url) == "1005007187023722"

    def test_extract_reference_with_query_params(self, store):
        """SKU extraction ignores query parameters."""
        url = "https://fr.aliexpress.com/item/1005007187023722.html?param=value"
        assert store.extract_reference(url) == "1005007187023722"

    def test_extract_reference_different_domain(self, store):
        """SKU extraction works across different AliExpress domains."""
        url = "https://www.aliexpress.com/item/9876543210987.html"
        assert store.extract_reference(url) == "9876543210987"

    # Parametrized so that each invalid input is checked and reported on its
    # own rather than stopping at the first failing one.
    @pytest.mark.parametrize(
        "url",
        [
            "https://www.aliexpress.com/category/electronics",
            "https://www.aliexpress.com/",
            "",
            None,
        ],
        ids=["category", "homepage", "empty", "none"],
    )
    def test_extract_reference_invalid_url(self, store, url):
        """SKU extraction from a non-product or missing URL returns None."""
        assert store.extract_reference(url) is None

    # ========== parse() tests ==========

    def test_parse_basic_html_with_title(self, store):
        """Basic HTML: the h1 text is used as title even with og:title set."""
        html = """
        <html>
        <head>
            <meta property="og:title" content="Samsung DDR4 RAM - AliExpress">
        </head>
        <body>
            <h1>Samsung DDR4 RAM Server Memory</h1>
        </body>
        </html>
        """
        url = "https://fr.aliexpress.com/item/1005007187023722.html"
        snapshot = store.parse(html, url)

        assert snapshot.source == "aliexpress"
        assert snapshot.url == "https://fr.aliexpress.com/item/1005007187023722.html"
        assert snapshot.title == "Samsung DDR4 RAM Server Memory"
        assert snapshot.reference == "1005007187023722"
        assert snapshot.currency == "EUR"  # fr.aliexpress → EUR

    def test_parse_title_from_meta_og(self, store):
        """The title falls back to og:title when there is no h1."""
        html = """
        <html>
        <head>
            <meta property="og:title" content="Product Name - AliExpress">
        </head>
        <body>
        </body>
        </html>
        """
        url = "https://www.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        assert snapshot.title == "Product Name"  # "- AliExpress" suffix removed
        assert snapshot.currency == "USD"  # .com → USD

    def test_parse_price_from_regex(self, store):
        """The price is found in the HTML text (number followed by €)."""
        html = """
        <html>
        <head>
            <meta property="og:title" content="Test Product - AliExpress">
        </head>
        <body>
            <h1>Test Product</h1>
            <div class="price-container">
                <span>Prix: 99,99 €</span>
            </div>
        </body>
        </html>
        """
        url = "https://fr.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        # approx() avoids depending on the exact float produced by parsing.
        assert snapshot.price == pytest.approx(99.99)
        assert snapshot.currency == "EUR"

    def test_parse_price_euro_before(self, store):
        """The price is parsed when the € sign precedes the number."""
        html = """
        <html>
        <head><meta property="og:title" content="Test - AliExpress"></head>
        <body>
            <h1>Test</h1>
            <span>€ 125.50</span>
        </body>
        </html>
        """
        url = "https://fr.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        # approx() avoids depending on the exact float produced by parsing.
        assert snapshot.price == pytest.approx(125.50)

    def test_parse_images_from_dcdata(self, store):
        """Images are read from window._d_c_.DCData."""
        html = """
        <html>
        <head><meta property="og:title" content="Test - AliExpress"></head>
        <body>
            <h1>Test</h1>
            <script>
                window._d_c_ = window._d_c_ || {};
                window._d_c_.DCData = {
                    "imagePathList": [
                        "https://ae01.alicdn.com/kf/image1.jpg",
                        "https://ae01.alicdn.com/kf/image2.jpg"
                    ]
                };
            </script>
        </body>
        </html>
        """
        url = "https://fr.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        assert len(snapshot.images) == 2
        assert snapshot.images[0] == "https://ae01.alicdn.com/kf/image1.jpg"
        assert snapshot.images[1] == "https://ae01.alicdn.com/kf/image2.jpg"
        # The parser is expected to record where the images came from.
        assert any("DCData" in note for note in snapshot.debug.notes)

    def test_parse_images_from_og_fallback(self, store):
        """Images fall back to og:image."""
        html = """
        <html>
        <head>
            <meta property="og:title" content="Test - AliExpress">
            <meta property="og:image" content="https://ae01.alicdn.com/kf/product.jpg">
        </head>
        <body>
            <h1>Test</h1>
        </body>
        </html>
        """
        url = "https://fr.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        assert len(snapshot.images) == 1
        assert snapshot.images[0] == "https://ae01.alicdn.com/kf/product.jpg"

    def test_parse_missing_title_and_price(self, store):
        """Missing title and price yield an incomplete, 'partial' snapshot."""
        html = "<html><body><p>Empty content</p></body></html>"
        url = "https://fr.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        assert snapshot.title is None
        assert snapshot.price is None
        assert not snapshot.is_complete()
        assert snapshot.debug.status == "partial"

    def test_parse_small_html_warning(self, store):
        """Parsing a small HTML document records a warning note."""
        html = "<html><head><title>Test</title></head><body></body></html>"
        url = "https://fr.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        # HTML under 200KB is expected to produce a "non rendu" (not
        # rendered) note.
        assert any("non rendu" in note.lower() for note in snapshot.debug.notes)

    def test_parse_stock_status_in_stock(self, store):
        """in_stock is detected from the add-to-cart button."""
        html = """
        <html>
        <head><meta property="og:title" content="Test - AliExpress"></head>
        <body>
            <h1>Test</h1>
            <button class="add-to-cart-btn">Add to Cart</button>
        </body>
        </html>
        """
        url = "https://fr.aliexpress.com/item/1234567890.html"
        snapshot = store.parse(html, url)

        assert snapshot.stock_status == "in_stock"