import json
import time
import random
import re
import logging
import os
import csv
import atexit
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd
import mysql.connector

# ----------------------------
# CONFIG
# ----------------------------
OUTPUT_FILE = "BuildVideos.json"
PARTNER_URLS_FILE = os.path.join(os.path.dirname(__file__), "partnerUrls.csv")
MAX_RETRIES = 3
TIMEOUT_SEC = 20
VPN_ENABLED = True
WG_CONFIG_DIR = os.environ.get("WG_CONFIG_DIR", "/etc/wireguard")
WG_COOLDOWN_SEC = float(os.environ.get("WG_COOLDOWN_SEC", "2.0"))
VPN_ROTATOR = None

# MySQL connection parameters (currently disabled; the hard-coded credentials below should move to environment variables before re-enabling)
# DB_CONFIG = {
#     "host": "10.8.0.1",
#     "user": "integration",
#     "password": "?Q8/{lVK2N08Y<b>k",
#     "database": "Salsify",
# }

# QUERY = """
# SELECT SL_MD.`SKU`,MS_BU.`Account SKU`, SL_BU.`Build SSKU #`, SL_BU.`Build Item #`
# FROM Salsify.MainData AS SL_MD
# 		LEFT JOIN Salsify.Build  AS SL_BU ON SL_BU.SKU = SL_MD.SKU
# 		LEFT JOIN MySamm.Build AS MS_BU ON MS_BU.SKU = SL_MD.SKU
# WHERE (MS_BU.SKU LIKE '%' AND  (SL_MD.Status)='Active' AND MS_BU.`Account SKU` IS NOT NULL)
#     OR 
# (MS_BU.SKU LIKE '%' AND (SL_MD.Status)='Liquidation' AND MS_BU.`Account SKU` IS NOT NULL)
# """

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

MANIFEST_RE = re.compile(r"\.m3u8(\?|$)", re.I)
FILE_RE = re.compile(r"\.(mp4|webm)(\?|$)", re.I)

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

try:
    from wg_vpn import WireGuardRotator
except Exception:
    WireGuardRotator = None


def load_partner_urls(csv_path: str, column_name: str) -> list[tuple[str, str]]:
    """Read (SKU, URL) pairs from *csv_path*, taking URLs from *column_name*.

    Rows whose URL cell is empty or the literal string "NULL" are skipped;
    the SKU is taken from the "SKU" column (may be empty).
    """
    pairs: list[tuple[str, str]] = []
    with open(csv_path, newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            link = (record.get(column_name) or "").strip()
            if not link or link.upper() == "NULL":
                continue
            pairs.append(((record.get("SKU") or "").strip(), link))
    return pairs


def extract_apollo_state(html: str) -> dict:
    """Locate and parse the ``window.__APOLLO_STATE__`` JSON object in *html*.

    Uses a string-aware brace scan so braces and escaped quotes inside JSON
    string values do not confuse the matcher. Returns {} when the marker is
    missing, the object is unterminated, or the JSON fails to parse.
    """
    marker_pos = html.find("window.__APOLLO_STATE__=")
    if marker_pos < 0:
        return {}
    start = html.find("{", marker_pos)
    if start < 0:
        return {}

    depth = 0
    inside_string = False
    escaped = False
    end = None
    pos = start
    length = len(html)
    while pos < length:
        ch = html[pos]
        if inside_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                inside_string = False
        elif ch == '"':
            inside_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                end = pos + 1
                break
        pos += 1

    if end is None:
        return {}
    try:
        return json.loads(html[start:end])
    except Exception:
        return {}


def collect_wistia_hashkeys(state: dict) -> set[str]:
    """Gather every Wistia ``hashKey`` from "Video"-typed objects anywhere in *state*.

    Traverses the Apollo cache (nested dicts/lists) iteratively and collects
    ``hashKey`` values from dicts whose ``__typename`` is "Video".
    """
    found: set[str] = set()
    pending = [state]
    while pending:
        node = pending.pop()
        if isinstance(node, dict):
            if node.get("__typename") == "Video" and node.get("hashKey"):
                found.add(node["hashKey"])
            pending.extend(node.values())
        elif isinstance(node, list):
            pending.extend(node)
    return found


def fetch_wistia_media(hash_key: str) -> dict | None:
    """Fetch Wistia media metadata for *hash_key* from the public embed endpoint.

    Returns the "media" dict from the JSONP payload, or None when the
    request fails, the payload is malformed, or the key is absent.
    """
    url = f"https://fast.wistia.com/embed/medias/{hash_key}.jsonp"
    try:
        response = request_with_vpn(url, headers=HEADERS, timeout=TIMEOUT_SEC)
        response.raise_for_status()
    except Exception as e:
        # Best-effort: a failed media lookup should not abort the page scrape.
        logging.warning(f"Wistia fetch failed for {hash_key}: {e}")
        return None

    # The endpoint returns JSONP (JSON wrapped in a JS call); strip the wrapper
    # by slicing from the first "{" to the last "}".
    text = response.text
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end == -1 or end <= start:
        return None
    try:
        payload = json.loads(text[start:end + 1])
    except Exception:
        return None
    return payload.get("media")


def build_download_url(asset_url: str | None) -> str | None:
    if not asset_url:
        return None
    if asset_url.endswith(".bin"):
        return asset_url[:-4] + "/file.mp4"
    return asset_url

def fetch_urls_from_db():
    """Build product-page URLs from the Salsify/MySamm Build tables.

    Returns a ``(urls, skus)`` pair of lists; both are empty when the DB
    configuration is unavailable, the connection fails, or the query
    returns no rows.
    """
    # Bug fix: DB_CONFIG and QUERY are commented out at module level, so the
    # original body raised an uncaught NameError when called. Fail softly so
    # callers can fall back to the CSV input path.
    db_config = globals().get("DB_CONFIG")
    query = globals().get("QUERY")
    if not db_config or not query:
        logging.error("DB_CONFIG/QUERY are not defined; skipping DB fetch")
        return [], []

    try:
        connection = mysql.connector.connect(**db_config)
    except mysql.connector.Error as e:
        logging.error(f"DB connection error: {e}")
        return [], []

    cursor = None  # bug fix: `cursor` was unbound in `finally` if creation failed
    try:
        cursor = connection.cursor(dictionary=True)
        cursor.execute(query)
        results = cursor.fetchall()
        if not results:
            # An empty DataFrame has no columns, so the assignments below
            # would raise KeyError; return the empty pair instead.
            return [], []
        df_input = pd.DataFrame(results)
        df_input["Full_URL"] = (
            "https://www.build.com/vigo-" + df_input["Build SSKU #"].astype(str) + "?uid=" + df_input["Build Item #"].astype(str)
        )
        urls = df_input["Full_URL"].dropna().tolist()
        skus = df_input["SKU"].tolist()
        return urls, skus
    finally:
        try:
            if cursor is not None:
                cursor.close()
            connection.close()
        except Exception:
            pass


def collect_media_urls_from_html(html: str) -> set[str]:
    """Extract every absolute http(s) URL in *html* that matches the manifest
    (.m3u8) or media-file (.mp4/.webm) patterns."""
    candidates = re.findall(r"https?://[^\s'\"]+", html)
    return {
        candidate
        for candidate in candidates
        if MANIFEST_RE.search(candidate) or FILE_RE.search(candidate)
    }


def collect_video_sources_from_html(html: str) -> set[str]:
    """Collect the ``src`` attribute of every <source> nested inside a <video> tag."""
    document = BeautifulSoup(html, "html.parser")
    found: set[str] = set()
    for video_tag in document.find_all("video"):
        for source_tag in video_tag.find_all("source"):
            src_value = source_tag.get("src")
            # Skip missing srcs and non-string values (e.g. multi-valued attrs).
            if isinstance(src_value, str):
                found.add(src_value)
    return found


def build_video_list(urls: set[str]) -> list[dict]:
    """Convert captured media URLs into sorted records of url/type/file_name.

    ``type`` is "m3u8" for manifests, "mp4/webm" for direct files, else None;
    ``file_name`` is the last path segment of the URL (None when empty).
    """
    records: list[dict] = []
    for link in sorted(urls):
        try:
            name = urlparse(link).path.split("/")[-1] or None
        except Exception:
            name = None
        if MANIFEST_RE.search(link):
            kind = "m3u8"
        elif FILE_RE.search(link):
            kind = "mp4/webm"
        else:
            kind = None
        records.append({"url": link, "type": kind, "file_name": name})
    return records


def request_with_vpn(url: str, headers: dict, timeout: int | float):
    """GET *url*, rotating the WireGuard endpoint when the request fails.

    A transport-level error rotates the VPN (when enabled and a rotator is
    configured) and re-raises the original exception. Any non-200 status
    while the VPN is enabled also triggers a rotation and raises
    requests.HTTPError, so a retrying caller gets a fresh exit IP.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except Exception as e:
        # Timeout / DNS / connection reset: rotate before propagating.
        if VPN_ENABLED and VPN_ROTATOR is not None:
            VPN_ROTATOR.rotate(f"request error: {e}")
        raise
    if VPN_ENABLED and response.status_code != 200:
        # Treat every non-200 as a block signal while the VPN is active.
        if VPN_ROTATOR is not None:
            VPN_ROTATOR.rotate(f"HTTP {response.status_code} for {url}")
        raise requests.HTTPError(f"HTTP {response.status_code} for {url}", response=response)
    return response


def process_url(url: str) -> list[dict]:
    """Scrape one product page and return every video discovered on it.

    Combines three strategies: URL regexes over the raw HTML,
    <video><source> tags, and Wistia media referenced from the embedded
    Apollo state. Retries up to MAX_RETRIES times with a randomized delay;
    returns [] when every attempt fails.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = request_with_vpn(url, headers=HEADERS, timeout=TIMEOUT_SEC)
            response.raise_for_status()
            html = response.text
            # Debug aid: keep the most recently fetched page on disk
            # (overwritten on every fetch).
            with open("debug_build_pdp.html", "w", encoding="utf-8") as f:
                f.write(html)

            captured = collect_media_urls_from_html(html)
            captured |= collect_video_sources_from_html(html)

            state = extract_apollo_state(html)
            wistia_videos = []
            for hash_key in sorted(collect_wistia_hashkeys(state)):
                media = fetch_wistia_media(hash_key)
                if not media:
                    continue
                assets = media.get("assets") or []
                # The "original" asset is the full-quality upload used for downloads.
                original = next((a for a in assets if a.get("type") == "original"), None)
                asset_url = original.get("url") if isinstance(original, dict) else None
                wistia_videos.append({
                    "hashKey": hash_key,
                    "original_url": asset_url,
                    "download_url": build_download_url(asset_url),
                    "name": media.get("name"),
                })

            videos = build_video_list(captured)
            for item in wistia_videos:
                if item.get("download_url"):
                    videos.append({
                        "url": item["download_url"],
                        "type": "mp4/webm",
                        "file_name": urlparse(item["download_url"]).path.split("/")[-1],
                        "hashKey": item.get("hashKey"),
                        "original_url": item.get("original_url"),
                        "name": item.get("name"),
                    })

            return videos
        except Exception as e:
            logging.warning(f"Error for {url} (attempt {attempt}/{MAX_RETRIES}): {e}")
            # Bug fix: only back off when another attempt remains; the original
            # slept 3-5 s after the final failure before giving up.
            if attempt < MAX_RETRIES:
                time.sleep(2 + random.uniform(1, 3))

    logging.error(f"Failed to fetch {url} after {MAX_RETRIES} attempts")
    return []


def main():
    """Entry point: load partner URLs, scrape each page, and save results to JSON.

    Output path comes from the OUTPUT_PATH environment variable when set,
    otherwise OUTPUT_FILE.
    """
    global VPN_ROTATOR
    if VPN_ENABLED and WireGuardRotator is not None:
        VPN_ROTATOR = WireGuardRotator(WG_CONFIG_DIR, cooldown_sec=WG_COOLDOWN_SEC)
        VPN_ROTATOR.ensure_up()
        # Ensure the tunnel is torn down even on an unhandled exception.
        atexit.register(VPN_ROTATOR.shutdown)

    pairs = load_partner_urls(PARTNER_URLS_FILE, "Build URL")
    if not pairs:
        logging.warning("No URLs loaded")
        return
    print(len(pairs))

    out = []
    total = len(pairs)
    for i, (sku, url) in enumerate(pairs):
        videos = process_url(url)
        out.append({
            "page_url": url,
            "SKU": sku,
            "videos": videos,
        })
        # Bug fix: the original f-string ran the counter and SKU together
        # ("Processed 1/10SKU123"); separate the fields for readable logs.
        logging.info(f"Processed {i+1}/{total}: {sku} -> {len(videos)} videos")
        # Jittered pause between pages to avoid hammering the site.
        time.sleep(random.uniform(1, 5))

    output_path = os.environ.get("OUTPUT_PATH") or OUTPUT_FILE
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2, ensure_ascii=False)

    print(json.dumps(out, indent=2, ensure_ascii=False))
    logging.info(f"Saved -> {output_path}")


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
