import hashlib
import os
import re
from urllib.parse import quote, unquote, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# File extension -> sub-directory of "local_assets" the file is stored in.
# Extensions that are not listed are stored directly in "local_assets".
_ASSET_SUBDIRS = {
    "css": "css",
    "js": "js",
    "png": "img",
    "jpg": "img",
    "jpeg": "img",
    "gif": "img",
    "svg": "img",
    "ico": "img",
    "woff": "fonts",
    "woff2": "fonts",
    "ttf": "fonts",
    "eot": "fonts",
}


def _resolve_asset_url(base_url, url):
    """Return the absolute http(s) URL for *url*, or None if it cannot be fetched."""
    url = url.strip()
    # Data URIs carry their payload inline; there is nothing to download.
    if url.startswith("data:"):
        return None
    try:
        # urljoin leaves absolute URLs untouched, so no prefix test is needed.
        full_url = urljoin(base_url, url)
        # Protocol-relative URLs ("//cdn.example.com/x.js") have no scheme,
        # which requests rejects; assume HTTPS for them.
        if full_url.startswith("//"):
            full_url = "https:" + full_url
        scheme = urlparse(full_url).scheme
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): skip it instead of
        # aborting the whole run.
        return None
    # Anything else (unresolved relative paths, mailto:, javascript:, ...)
    # cannot be downloaded over HTTP.
    return full_url if scheme in ("http", "https") else None


def _asset_filename(full_url):
    """Return the decoded last path segment of *full_url*, or "" if unusable."""
    # Decode percent-escapes so the name on disk matches what a browser looks
    # up after decoding the rewritten reference.
    name = os.path.basename(unquote(urlparse(full_url).path))
    return "" if name in ("", ".", "..") else name


def _asset_subdir(filename):
    """Return the "local_assets" sub-directory for *filename* ("" if none)."""
    ext = filename.rsplit(".", 1)[-1].lower() if "." in filename else ""
    return _ASSET_SUBDIRS.get(ext, "")


def _disambiguated_name(filename, full_url):
    """Return *filename* with a short hash of *full_url* inserted before the extension."""
    digest = hashlib.sha256(full_url.encode("utf-8")).hexdigest()[:8]
    stem, ext = os.path.splitext(filename)
    return f"{stem}-{digest}{ext}"


def _fetch_to_file(full_url, local_file):
    """Download *full_url* into *local_file*; return True on success."""
    # Deliberately best-effort: one failing asset must not abort the run
    # before the HTML is saved, so every error is reported and swallowed here.
    try:
        r = requests.get(full_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
        if r.status_code != 200:
            print(f"  Failed: {r.status_code}")
            return False
        with open(local_file, "wb") as f:
            f.write(r.content)
    except Exception as e:
        print(f"  Error: {e}")
        return False
    return True


def download_assets(html_file, output_root):
    """Download the remote assets referenced by an HTML file and relink them.

    Scans ``img``, ``link``, ``script``, ``source`` and ``video`` tags, saves
    each referenced http(s) resource under ``<output_root>/local_assets/``
    (grouped into ``css``, ``js``, ``img`` and ``fonts`` by file extension)
    and rewrites the attribute to ``./local_assets/...``. The HTML file is
    then overwritten in place with the prettified document.

    A file that already exists at the target path is treated as a previous
    download and is not fetched again. Different URLs that share a file name
    within one run get distinct local names. References whose download fails
    are left unchanged.

    Args:
        html_file: Path of the UTF-8 HTML file to process (modified in place).
        output_root: Directory in which ``local_assets`` is created. The
            rewritten references are relative to this directory, so they only
            resolve when the HTML file is opened from ``output_root``.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    if not os.path.exists(html_file):
        print(f"File {html_file} not found.")
        return

    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, 'html.parser')

    # A <base href>, if present, is what relative asset URLs resolve against.
    # NOTE(review): the <base> tag is left in the saved document, so a browser
    # may resolve the rewritten "./local_assets/..." references against it.
    base_tag = soup.find('base')
    base_url = base_tag['href'] if base_tag and base_tag.has_attr('href') else ""

    # Tag name -> attribute holding the asset URL.
    # NOTE(review): every <link href> is treated as an asset regardless of its
    # rel value (e.g. canonical) -- confirm whether that is intended.
    tags = {
        'img': 'src',
        'link': 'href',
        'script': 'src',
        'source': 'src',
        'video': 'src'
    }

    print(f"Listing and downloading assets from {html_file}...")

    # Local path -> URL that owns it during this run. Lets us notice when two
    # different URLs share a base name instead of silently reusing the first file.
    claimed = {}

    for tag_name, attr in tags.items():
        for tag in soup.find_all(tag_name, **{attr: True}):
            full_url = _resolve_asset_url(base_url, tag[attr])
            if full_url is None:
                continue

            filename = _asset_filename(full_url)
            if not filename:
                continue

            sub_dir = _asset_subdir(filename)
            dir_path = os.path.join(output_root, "local_assets", sub_dir)
            os.makedirs(dir_path, exist_ok=True)

            local_file = os.path.join(dir_path, filename)
            if claimed.setdefault(local_file, full_url) != full_url:
                # Same file name, different URL: give this asset its own file.
                filename = _disambiguated_name(filename, full_url)
                local_file = os.path.join(dir_path, filename)
                claimed.setdefault(local_file, full_url)

            # Re-encode the name so spaces and similar characters stay valid
            # inside the HTML attribute.
            rel_path = "/".join(
                part
                for part in (".", "local_assets", sub_dir, quote(filename))
                if part
            )

            if not os.path.exists(local_file):
                print(f"Downloading {full_url} -> {rel_path}")
                if not _fetch_to_file(full_url, local_file):
                    # Keep the original reference when the download failed.
                    continue

            tag[attr] = rel_path

    # Save updated HTML
    with open(html_file, "w", encoding="utf-8") as f:
        f.write(soup.prettify())
    print("Assets downloaded and HTML references updated.")

if __name__ == "__main__":
    # Script entry point: edit the file name below to process another page.
    # Assets are stored under the current working directory (".").
    download_assets("index.html", ".")
