import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
import random
import logging
import undetected_chromedriver as uc
from urllib.parse import urljoin
import mysql.connector
import os
import atexit

# WireGuard VPN rotation settings, all driven by environment variables.
VPN_ENABLED = os.environ.get("WG_ENABLED") == "1"
WG_CONFIG_DIR = os.environ.get("WG_CONFIG_DIR", "/etc/wireguard")
WG_COOLDOWN_SEC = float(os.environ.get("WG_COOLDOWN_SEC", "2.0"))
VPN_ROTATOR = None

# The VPN helper module is optional: when it cannot be imported, every
# helper name is set to None and the scraper runs without rotation.
try:
    from wg_vpn import (
        WireGuardRotator,
        parse_document_status_from_performance_logs,
        should_rotate_on_status,
        is_dns_error_html,
    )
except Exception:
    WireGuardRotator = parse_document_status_from_performance_logs = None
    should_rotate_on_status = is_dns_error_html = None

# Connection settings for the Salsify MySQL database.
# NOTE(review): credentials are hard-coded in source — consider moving them
# to environment variables or a secrets store.
db_config = {
    "database": "Salsify",
    "host": "104.236.70.14",
    "password": "?Q8/{lVK2N08Y<b>k",
    "user": "integration",
}

# Connect to the database; abort with a non-zero exit code on failure.
try:
    connection = mysql.connector.connect(**db_config)
except mysql.connector.Error as e:
    # BUG FIX: the original silently swallowed the error (the print was
    # commented out) and called bare exit(), which exits with status 0 and
    # hides the failure from schedulers/cron. Report and exit non-zero.
    logging.error(f"Database connection failed: {e}")
    exit(1)


# Today's and yesterday's tracked SKUs that have a Lowe's URL and whose
# master-data status is 'Active' or 'Liquidation'.
query = """
SELECT DISTINCT dt.SKU, dt.`Lowes URL` FROM ProductTracker.DailyTracker dt 
INNER JOIN Salsify.MainData md ON dt.SKU = md.SKU 
WHERE (dt.Date = CURRENT_DATE OR dt.Date = CURRENT_DATE - 1) AND dt.`Lowes URL` IS NOT NULL AND md.Status IN ('Active', 'Liquidation')
"""

# Fetch the SKU / URL pairs and release DB resources even on failure.
cursor = None
try:
    cursor = connection.cursor(dictionary=True)
    cursor.execute(query)
    results = cursor.fetchall()

    df_input = pd.DataFrame(results)

    # BUG FIX: an empty result set produces a DataFrame with no columns,
    # so indexing by column name raised KeyError. Also, dropping NaN URLs
    # from one column while taking SKUs from the unfiltered column could
    # desynchronize the two parallel lists — filter rows together instead.
    if df_input.empty:
        urls = []
        additional_data = []
    else:
        df_input = df_input.dropna(subset=["Lowes URL"])
        urls = df_input["Lowes URL"].tolist()
        additional_data = df_input["SKU"].tolist()

finally:
    # BUG FIX: `cursor` was undefined here when connection.cursor() itself
    # raised, turning the real error into a NameError.
    if cursor is not None:
        cursor.close()
    if connection.is_connected():
        connection.close()


# Timestamped log lines at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Rows collected for the final JSON payload.
all_data = []

# Retry policy for a single product page.
max_retries = 3
timeout = 20


# Browser-like request headers.
# BUG FIX: the original literal repeated the first four keys; duplicate
# keys in a dict literal silently collapse, so each key is listed once.
headers = {
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Referer": "https://www.lowes.com/",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
    "TE": "trailers",
    "DNT": "1",
}

def getDriver():
    """Build a headless undetected-chromedriver Chrome instance.

    Performance logging is enabled so HTTP status codes can later be
    recovered from the CDP logs.
    """
    opts = uc.ChromeOptions()
    for flag in (
        "--no-sandbox",                                   # required in some environments
        "--disable-blink-features=AutomationControlled",  # hide the automation fingerprint
        "--disable-gpu",
        "--headless=new",
        "--start-maximized",
        "--disable-extensions",
        "--disable-infobars",
    ):
        opts.add_argument(flag)
    opts.set_capability("goog:loggingPrefs", {"performance": "ALL"})
    return uc.Chrome(options=opts, use_subprocess=True)


def ensure_driver_ready(driver, max_attempts: int = 15):
    """Return a driver that responds to basic commands.

    Probes the given driver; whenever a probe fails, the driver is torn
    down and replaced with a fresh one, up to ``max_attempts`` times.
    The last driver is returned even if it never proved healthy.
    """
    remaining = max_attempts
    while remaining > 0:
        remaining -= 1
        try:
            _ = driver.current_url  # raises when the session is dead
            driver.get("about:blank")
            driver.execute_cdp_cmd("Network.enable", {})
            time.sleep(0.5)
            return driver
        except Exception:
            try:
                driver.quit()
            except Exception:
                pass  # already dead — nothing left to clean up
            time.sleep(1.5)
            driver = getDriver()
    return driver

# When VPN rotation is enabled and the helper module imported successfully,
# bring the WireGuard tunnel up now and register a clean teardown at exit.
if VPN_ENABLED and WireGuardRotator is not None:
    VPN_ROTATOR = WireGuardRotator(WG_CONFIG_DIR, cooldown_sec=WG_COOLDOWN_SEC)
    VPN_ROTATOR.ensure_up()
    atexit.register(VPN_ROTATOR.shutdown)
# Shared browser instance used by process_url below.
driver = ensure_driver_ready(getDriver())



def process_url(url):
    """Scrape PDF document links from a single Lowe's product page.

    Retries up to ``max_retries`` times, optionally rotating the VPN
    endpoint on blocking HTTP statuses or DNS error pages.  Returns a
    list of {'PDF Link', 'PDF Text'} dicts, a single 'PDF not found'
    placeholder when the page has no PDFs, or an empty list when every
    attempt failed.
    """
    for attempt in range(max_retries):
        try:
            # Was logged at ERROR severity; this is routine progress info.
            logging.info(f"Підключення до {url}, спроба {attempt + 1}...")
            # Start from a clean slate so cached/blocked responses do not
            # leak between attempts.
            driver.execute_cdp_cmd("Network.clearBrowserCache", {})
            driver.execute_cdp_cmd("Network.clearBrowserCookies", {})
            driver.get(url)
            time.sleep(4)

            # Rotate the VPN when the document came back with a blocking
            # HTTP status (read from the CDP performance logs).
            if VPN_ENABLED and VPN_ROTATOR is not None and parse_document_status_from_performance_logs is not None:
                try:
                    status = parse_document_status_from_performance_logs(driver.get_log("performance"), url)
                    if should_rotate_on_status is not None and should_rotate_on_status(status):
                        VPN_ROTATOR.rotate(f"HTTP {status} for {url}")
                        continue
                except Exception:
                    pass  # best effort — log parsing must not kill the attempt

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Rotate as well when DNS failed inside the tunnel and Chrome
            # rendered its error page instead of the product page.
            if VPN_ENABLED and VPN_ROTATOR is not None and is_dns_error_html is not None:
                try:
                    if is_dns_error_html(driver.page_source):
                        VPN_ROTATOR.rotate(f"DNS error for {url}")
                        continue
                except Exception:
                    pass

            links = soup.find_all('a', class_="LinkBase-sc-2ngoxx-0 eViSzl backyard link size--small color--black")

            pdf_links = []
            for link in links:
                href = link.get('href', '-')
                if href.endswith('.pdf'):
                    # Protocol-relative hrefs become absolute; embedded spaces
                    # are stripped.
                    formatted_href = urljoin("https:", href).replace(" ", "")

                    span = link.find('span', class_="label link-label")
                    text = span.get_text(strip=True) if span else "No label"

                    pdf_links.append({'PDF Link': formatted_href, 'PDF Text': text})

            if not pdf_links:
                pdf_links.append({'PDF Link': 'PDF not found', 'PDF Text': 'PDF not found'})

            return pdf_links

        except Exception as e:
            # BUG FIX: the original caught requests.exceptions.RequestException,
            # but this function only drives Selenium, whose failures raise
            # WebDriverException — the handler never fired and any browser
            # error crashed the whole run.
            logging.warning(f"Помилка для {url}: {e}")
            if VPN_ENABLED and VPN_ROTATOR is not None:
                VPN_ROTATOR.rotate(f"exception: {e}")
            time.sleep(2 + random.uniform(1, 3))
    return []

# Process every product URL, restarting the browser every 15 pages to keep
# Chrome's memory footprint in check.
for i, url in enumerate(urls):
    if i % 15 == 0 and i != 0:
        driver.quit()
        time.sleep(3)
        driver = ensure_driver_ready(getDriver())
    data = process_url(url)
    if data:
        # A page can expose several PDFs; spread them across numbered columns.
        row = {'Base URL': url}
        for j, item in enumerate(data):
            row[f"PDF Link{j+1}"] = item['PDF Link']
            row[f"PDF Text{j+1}"] = item['PDF Text']
        all_data.append(row)
        # BUG FIX: progress messages were logged at ERROR severity.
        logging.info(f"Processed {i} of {len(urls)}")
    else:
        # BUG FIX: pages that failed every retry were dropped with no trace.
        logging.warning(f"All retries failed for {url}")
    time.sleep(random.uniform(3, 6))  # jitter to avoid bot detection

# Emit the scraped data: into OUTPUT_PATH when set, otherwise to stdout.
payload = json.dumps(all_data)
output_path = os.environ.get("OUTPUT_PATH")
if output_path:
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(payload)
else:
    print(payload)

driver.quit()

# Additionally keep a pretty-printed local JSON copy next to the script.
output_file = "LowesPDF.json"
with open(output_file, "w") as json_file:
    json.dump(all_data, json_file, indent=4)
logging.info(f"Дані збережені в {output_file}")
