"""Flashscore parser.

Important notes
---------------
Flashscore pages are JS-heavy and have anti-bot measures. In production the
most reliable approach is Playwright (headless Chromium) with:
  - realistic User-Agent
  - small jittered delays
  - caching to reduce the request rate against the site

This parser provides two backends:
  * aiohttp + BeautifulSoup (fast; works when HTML contains data)
  * Playwright (slower; works for JS-rendered pages)

The worker chooses Playwright by default (PARSER_USE_PLAYWRIGHT=True).
"""

from __future__ import annotations

import asyncio
import json
import random
import re
import time
from dataclasses import dataclass
from typing import Any
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup

from config import settings


# Desktop Chrome-on-Linux User-Agent string sent by both fetch backends.
_UA = " ".join(
    (
        "Mozilla/5.0 (X11; Linux x86_64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0",
        "Safari/537.36",
    )
)


@dataclass
class _CacheEntry:
    """One cached page body together with the time it was stored."""

    # Timestamp recorded when the entry was written; the parser compares it
    # against the current time to decide whether the entry has expired.
    ts: float
    # Page HTML as returned by the fetch backend.
    html: str

class FlashscoreParser:
    def __init__(self):
        self._cache: dict[str, _CacheEntry] = {}
        self._lock = asyncio.Lock()

    async def _sleep_jitter(self) -> None:
        base = float(settings.PARSER_MIN_DELAY_S)
        await asyncio.sleep(base + random.random() * 0.35)

    def _cache_get(self, url: str) -> str | None:
        e = self._cache.get(url)
        if not e:
            return None
        if time.time() - e.ts > settings.PARSER_CACHE_TTL_S:
            self._cache.pop(url, None)
            return None
        return e.html

    def _cache_set(self, url: str, html: str) -> None:
        self._cache[url] = _CacheEntry(ts=time.time(), html=html)

    async def fetch_html(self, url: str) -> str:
        cached = self._cache_get(url)
        if cached:
            return cached

        # Prevent stampedes to the same endpoint
        async with self._lock:
            cached = self._cache_get(url)
            if cached:
                return cached

            await self._sleep_jitter()

            if settings.PARSER_USE_PLAYWRIGHT:
                html = await self._fetch_playwright(url)
            else:
                html = await self._fetch_aiohttp(url)
            self._cache_set(url, html)
            return html

    async def _fetch_aiohttp(self, url: str) -> str:
        timeout = aiohttp.ClientTimeout(total=30)
        headers = {"User-Agent": _UA, "Accept-Language": "en-US,en;q=0.9"}
        async with aiohttp.ClientSession(timeout=timeout, headers=headers) as session:
            async with session.get(url, allow_redirects=True) as r:
                r.raise_for_status()
                return await r.text()

    async def _fetch_playwright(self, url: str) -> str:
        # Import lazily: Playwright is heavy.
        from playwright.async_api import async_playwright  # type: ignore

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=settings.PARSER_HEADLESS, args=[
                "--no-sandbox",
                "--disable-dev-shm-usage",
                "--disable-blink-features=AutomationControlled",
            ])
            context = await browser.new_context(user_agent=_UA, locale="en-US")
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="domcontentloaded", timeout=45000)
                # Some pages load content after DOMContentLoaded
                await page.wait_for_timeout(1200)
                return await page.content()
            finally:
                await context.close()
                await browser.close()

    # ----------------- Parsing helpers -----------------

    _match_href_re = re.compile(r"/match/[^/]+/")

    def _extract_match_links(self, html: str, base_url: str = "https://www.flashscore.com") -> list[str]:
        soup = BeautifulSoup(html, "lxml")
        links: set[str] = set()
        for a in soup.find_all("a", href=True):
            href = a.get("href") or ""
            if self._match_href_re.search(href):
                if href.startswith("http"):
                    links.add(href)
                else:
                    links.add(base_url.rstrip("/") + href)
        return sorted(links)

    def _extract_teams_from_anchor_text(self, a_text: str) -> tuple[str | None, str | None]:
        # Common UI is "Team A - Team B" (dash variations)
        for sep in [" — ", " - ", " – ", " vs "]:
            if sep in a_text:
                left, right = a_text.split(sep, 1)
                left = left.strip()
                right = right.strip()
                if left and right:
                    return left, right
        return None, None

    def _stable_match_id(self, match_url: str) -> str:
        # Use the last path segment when available; fallback to full URL hash.
        # Example: https://www.flashscore.com/match/football/<id>/
        m = re.search(r"/match/[^/]+/([^/]+)/", match_url)
        if m:
            return m.group(1)
        return re.sub(r"\W+", "_", match_url)[-64:]

    # ----------------- Public API used by worker -----------------

    async def fetch_competition_matches(self, competition_url: str) -> list[dict[str, Any]]:
        """Parse competition page and return match stubs.

        Output dict keys align with `database.models.Match`.
        """
        html = await self.fetch_html(competition_url)
        soup = BeautifulSoup(html, "lxml")

        out: list[dict[str, Any]] = []
        seen: set[str] = set()

        # Heuristic: anchors containing /match/ are the most stable piece.
        for a in soup.find_all("a", href=True):
            href = a.get("href") or ""
            if not self._match_href_re.search(href):
                continue
            match_url = href if href.startswith("http") else "https://www.flashscore.com" + href
            match_url = match_url.split("#")[0]
            if match_url in seen:
                continue
            seen.add(match_url)

            # Teams + possibly score/status: try from surrounding text.
            # Using parent container text improves chance to capture score shown near the link.
            container = a.parent
            container_text = " ".join(container.stripped_strings) if container else ""
            text = container_text or " ".join(a.stripped_strings)
            home, away = self._extract_teams_from_anchor_text(text)

            # Score if present on the competition page (results sections often show it)
            hs = as_ = None
            mscore = re.search(r"\b(\d{1,2})\s*[:\-]\s*(\d{1,2})\b", text)
            if mscore:
                try:
                    hs = int(mscore.group(1))
                    as_ = int(mscore.group(2))
                except Exception:
                    hs = as_ = None

            status = "scheduled"
            if hs is not None and as_ is not None:
                status = "finished"

            out.append(
                {
                    "id": self._stable_match_id(match_url),
                    "source": "flashscore",
                    "competition_url": competition_url,
                    "match_url": match_url,
                    "kickoff_ts": None,
                    "status": status,
                    "home_team": home or "TBD",
                    "away_team": away or "TBD",
                    "minute": None,
                    "home_score": hs,
                    "away_score": as_,
                }
            )

        # Deduplicate and keep deterministic order.
        uniq: dict[str, dict[str, Any]] = {m["id"]: m for m in out}
        return list(uniq.values())

    async def fetch_match_detail_update(self, match_url: str) -> dict[str, Any]:
        """Fetch match page and extract lightweight live fields.

        Returns: {status, minute, home_score, away_score}
        """
        html = await self.fetch_html(match_url)
        soup = BeautifulSoup(html, "lxml")

        # Score: attempt common score containers.
        text = soup.get_text(" ", strip=True)

        # minute like "67'"
        minute = None
        m = re.search(r"\b(\d{1,3})\s*'", text)
        if m:
            try:
                minute = int(m.group(1))
            except Exception:
                minute = None

        # score like "2 - 1" or "2:1"
        hs = as_ = None
        m2 = re.search(r"\b(\d{1,2})\s*[-:]\s*(\d{1,2})\b", text)
        if m2:
            try:
                hs = int(m2.group(1))
                as_ = int(m2.group(2))
            except Exception:
                hs = as_ = None

        # Status: rough mapping
        status = "live" if minute is not None else "scheduled"
        if re.search(r"\bFT\b|Full\s*Time|Finished", text, re.I):
            status = "finished"

        return {"status": status, "minute": minute, "home_score": hs, "away_score": as_}
