"""
Download ALL building photos from NYCU Yangming campus GIS server.
Queries loadImage endpoint for every building, then downloads each image.
"""

import json
import os
import re
import time
import warnings
from pathlib import Path
from urllib.parse import urljoin, urlparse, parse_qs

import requests
# SESSION below is created with certificate verification turned off, so
# urllib3 would warn on every request; silence those warnings.
requests.packages.urllib3.disable_warnings()
# NOTE: this suppresses *every* Python warning for the whole process,
# not only the TLS-related ones.
warnings.filterwarnings('ignore')

# --- Configuration ---
# Root of the GIS web application; the endpoint URLs below derive from it.
BASE_URL = "https://ymspace.ga.nycu.edu.tw/gisweb"
# Endpoint queried with action=loadImage to list a building's image references.
LOAD_IMAGE_URL = f"{BASE_URL}/public/buildinfo.htm"
# Endpoint queried with action=listImg&q=<id> to fetch one image.
LIST_IMG_URL = f"{BASE_URL}/public/uploadfiles.htm"
# NOTE(review): hard-coded absolute Windows path for one user's desktop;
# it must be edited before running on any other machine.
OUTPUT_DIR = Path(r"C:\Users\thc1006\Desktop\NQSD\新增資料夾\data\ymmap_archive\building_photos_v2")
# Raw loadImage response bodies are stored here, one text file per query.
RAW_RESPONSES_DIR = OUTPUT_DIR / "raw_responses"
# Downloaded images are stored here, in one sub-directory per building.
IMAGES_DIR = OUTPUT_DIR / "images"
# JSON report written at the end of a run.
SUMMARY_FILE = OUTPUT_DIR / "summary.json"

# All 48 building IDs to process, in processing order.
BUILDING_IDS = [
    "Y001", "B013", "B020", "P004", "B019", "B003", "B005", "B029",
    "P006", "P003", "Y012", "B010", "B009", "B004", "P005", "B017",
    "B016", "B012", "B011", "B015", "B021", "B022", "B014", "B018",
    "B023", "G005", "Y002", "B033", "B028", "Y004", "Y005", "B032",
    "B034", "B030", "B025", "B026", "G002", "B027", "B031", "B024",
    "Y003", "G022", "Y011", "Y007", "Y006", "Y008", "Y010", "Y009",
]

# Known floor lists per building; each entry is sent as the naviKey
# request parameter. Only a few buildings are listed here - every other
# building falls back to DEFAULT_FLOORS.
BUILDING_FLOORS = {
    "Y001": ["R2", "R1", "6F", "5F", "4F", "3F", "2F", "1F", "B1"],
    "B005": ["RF", "6F", "5F", "4F", "3F", "2F", "1F", "B1", "4M"],
    "P003": ["RF", "9F", "8F", "7F", "6F", "5F", "4F", "3F", "2F", "1F", "B1", "B2"],
}

# Floors to try for buildings that have no entry in BUILDING_FLOORS.
DEFAULT_FLOORS = ["1F"]

# One shared HTTP session so headers and connections are reused by every request.
SESSION = requests.Session()
# Certificate verification is disabled for all requests made through SESSION.
SESSION.verify = False
# Browser-like headers; Referer points at the buildinfo page itself.
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-TW,zh;q=0.9,en;q=0.8",
    "Referer": "https://ymspace.ga.nycu.edu.tw/gisweb/public/buildinfo.htm",
})


def ensure_dirs():
    """Make sure the raw-response and image output folders exist.

    Missing parent directories are created as well; folders that already
    exist are left untouched.
    """
    for target in (RAW_RESPONSES_DIR, IMAGES_DIR):
        target.mkdir(parents=True, exist_ok=True)


def extract_image_ids_from_html(html_text):
    """Extract image IDs and candidate image URLs from an HTML response.

    The text is scanned with several independent regular expressions, so
    one reference may be reported more than once in different forms
    (for example both as an ID and as a URL).

    Args:
        html_text: Response body to scan (any text; need not be valid HTML).

    Returns:
        A ``(image_ids, image_urls)`` tuple of two sets of strings:
        ``image_ids`` holds the ``q`` values of ``listImg`` references,
        ``image_urls`` holds absolute or relative URLs that look like images.
    """
    image_ids = set()
    image_urls = set()

    # Pattern 1: listImg&q=XXXX or listImg?q=XXXX.
    # This also matches the longer "uploadfiles.htm?action=listImg&q=XXXX"
    # form, so no separate pattern is needed for it.
    for m in re.finditer(r'listImg[&?]q=([A-Za-z0-9_\-\.]+)', html_text):
        image_ids.add(m.group(1))

    # Pattern 3: src="..." with image extensions
    for m in re.finditer(r'src=["\']([^"\']*?(?:\.jpg|\.jpeg|\.png|\.gif|\.bmp|\.webp)[^"\']*)["\']', html_text, re.IGNORECASE):
        image_urls.add(m.group(1))

    # Pattern 4: Any URL-like pattern with image extensions
    for m in re.finditer(r'(https?://[^\s"\'<>]+?(?:\.jpg|\.jpeg|\.png|\.gif|\.bmp|\.webp))', html_text, re.IGNORECASE):
        image_urls.add(m.group(1))

    # Pattern 5: background-image or style with url()
    # The markers are compared against the lower-cased URL, so they must be
    # lower-case themselves ('listimg', not 'listImg').
    url_markers = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', 'listimg')
    for m in re.finditer(r'url\(["\']?([^"\')\s]+)["\']?\)', html_text):
        url = m.group(1)
        lowered = url.lower()
        if any(marker in lowered for marker in url_markers):
            image_urls.add(url)

    # Pattern 6: data attributes with image references
    for m in re.finditer(r'data-[a-z]+=["\']([^"\']*(?:listImg|\.jpg|\.jpeg|\.png)[^"\']*)["\']', html_text, re.IGNORECASE):
        image_urls.add(m.group(1))

    # Pattern 7: Any uploadfiles reference
    for m in re.finditer(r'(uploadfiles[^"\'<>\s]*)', html_text):
        image_urls.add(m.group(1))

    # Pattern 8: Image file references (relative paths)
    for m in re.finditer(r'["\']([^"\']*?/(?:images?|photos?|pics?|uploads?)/[^"\']+)["\']', html_text, re.IGNORECASE):
        image_urls.add(m.group(1))

    return image_ids, image_urls


def extract_image_ids_from_json(json_data):
    """Extract image IDs and candidate image URLs from decoded JSON.

    Every string found in the structure is inspected together with the
    dict key it belongs to. Strings inside a list inherit the key of the
    list (``{"images": ["a.jpg", "b.jpg"]}``); a string with no enclosing
    dict is inspected with an empty key.

    Args:
        json_data: Decoded JSON value (dict, list, str, or any scalar).

    Returns:
        A ``(image_ids, image_urls)`` tuple of two sets of strings.
    """
    image_ids = set()
    image_urls = set()
    image_exts = ('.jpg', '.jpeg', '.png', '.gif', '.webp')

    def _inspect(key, value):
        """Classify one string value found under dict key *key*."""
        key_lower = key.lower()
        # Keys that mark a bare token as an image ID. 'pic' only qualifies a
        # value for the URL check below, not for the bare-ID check.
        key_names_image = (
            'image' in key_lower or 'photo' in key_lower or 'img' in key_lower
        )
        has_list_img = 'listImg' in value

        if has_list_img or key_names_image or 'pic' in key_lower:
            if has_list_img:
                for m in re.finditer(r'listImg[&?]q=([A-Za-z0-9_\-\.]+)', value):
                    image_ids.add(m.group(1))
            if value.startswith(('http', '/')):
                image_urls.add(value)
            elif key_names_image and re.match(r'^[A-Za-z0-9_\-\.]+$', value):
                image_ids.add(value)

        # Any string containing an image extension is kept as a URL,
        # regardless of the key it was stored under.
        lowered = value.lower()
        if any(ext in lowered for ext in image_exts):
            image_urls.add(value)

    def _recurse(obj, key=''):
        """Walk *obj*, remembering the nearest enclosing dict key."""
        if isinstance(obj, str):
            _inspect(key, obj)
        elif isinstance(obj, dict):
            for k, v in obj.items():
                _recurse(v, str(k))
        elif isinstance(obj, list):
            for item in obj:
                _recurse(item, key)

    _recurse(json_data)
    return image_ids, image_urls


def query_load_image(build_id, navi_key=None):
    """Ask the loadImage endpoint which images belong to a building.

    A GET request is sent first. Only when it produced no image IDs and no
    image URLs is the same query repeated as a form-encoded POST. Each
    response body is parsed as JSON (when it decodes) and always scanned
    as HTML as well.

    Args:
        build_id: Building identifier sent as ``buildId``.
        navi_key: Optional value sent as ``naviKey``; omitted when falsy.

    Returns:
        ``(image_ids, image_urls, raw_text)``: two sets of strings plus the
        raw response text. ``raw_text`` is ``None`` when no response body
        was received; when both requests answered, the POST body is
        appended after a ``--- POST RESPONSE ---`` separator.
    """
    query = {"action": "loadImage", "buildId": build_id}
    if navi_key:
        query["naviKey"] = navi_key

    found_ids = set()
    found_urls = set()
    raw_text = None

    def _harvest(response):
        # JSON first: a body that is not JSON simply contributes nothing here.
        try:
            decoded = response.json()
            json_ids, json_urls = extract_image_ids_from_json(decoded)
            found_ids.update(json_ids)
            found_urls.update(json_urls)
        except (json.JSONDecodeError, ValueError):
            pass

        # The body is always scanned as HTML too (embedded references).
        html_ids, html_urls = extract_image_ids_from_html(response.text)
        found_ids.update(html_ids)
        found_urls.update(html_urls)

    # Attempt 1: GET
    try:
        reply = SESSION.get(LOAD_IMAGE_URL, params=query, timeout=30)
        # Keep the body before the status check so error pages are recorded.
        raw_text = reply.text
        reply.raise_for_status()
        _harvest(reply)
    except requests.RequestException as exc:
        print(f"    GET failed for {build_id} (naviKey={navi_key}): {exc}")

    if found_ids or found_urls:
        return found_ids, found_urls, raw_text

    # Attempt 2: POST, only reached when GET yielded nothing
    try:
        reply = SESSION.post(LOAD_IMAGE_URL, data=query, timeout=30)
        raw_text = (
            reply.text
            if raw_text is None
            else raw_text + "\n\n--- POST RESPONSE ---\n\n" + reply.text
        )
        reply.raise_for_status()
        _harvest(reply)
    except requests.RequestException as exc:
        print(f"    POST failed for {build_id} (naviKey={navi_key}): {exc}")

    return found_ids, found_urls, raw_text


def download_image_by_id(image_id, save_dir, build_id):
    """Download one image through the listImg endpoint.

    When the endpoint answers with HTML or JSON instead of image bytes, the
    body is searched for absolute image URLs and each candidate is tried in
    turn until one downloads successfully. If none does, the body is saved
    next to the images as ``<build_id>_<image_id>_response.txt`` for
    debugging.

    Args:
        image_id: Value for the ``q`` parameter of the listImg query.
        save_dir: ``Path`` of the directory the file is written to.
        build_id: Building identifier, used as the file-name prefix.

    Returns:
        The saved file name, or ``None`` when nothing was saved (request
        error, text response without a usable image, or a body shorter
        than 100 bytes).
    """
    url = f"{LIST_IMG_URL}?action=listImg&q={image_id}"
    try:
        resp = SESSION.get(url, timeout=30, stream=True)
        resp.raise_for_status()

        content_type = resp.headers.get('Content-Type', '')

        # If content is not an image, it might be HTML/JSON pointing at one
        if 'text/html' in content_type or 'application/json' in content_type:
            text = resp.text
            candidates = re.findall(r'(https?://[^\s"\'<>]+?(?:\.jpg|\.jpeg|\.png|\.gif|\.webp))', text, re.IGNORECASE)
            # Try every distinct candidate in document order; stop at the
            # first that actually yields a file instead of giving up after
            # the first attempt.
            for actual_url in dict.fromkeys(candidates):
                fname = download_image_by_url(actual_url, save_dir, build_id, image_id)
                if fname:
                    return fname
            # Save the text response for debugging
            debug_file = save_dir / f"{build_id}_{image_id}_response.txt"
            debug_file.write_text(text, encoding='utf-8')
            print(f"    [DEBUG] listImg returned text for {image_id}, saved to {debug_file.name}")
            return None

        # Determine file extension from content type ('.jpg' when unknown)
        ext_map = {
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'image/gif': '.gif',
            'image/bmp': '.bmp',
            'image/webp': '.webp',
        }
        ext = next(
            (suffix for ct, suffix in ext_map.items() if ct in content_type),
            '.jpg',
        )

        # Check minimum file size (at least 100 bytes for a real image)
        content = resp.content
        if len(content) < 100:
            print(f"    [SKIP] {image_id}: response too small ({len(content)} bytes)")
            return None

        filename = f"{build_id}_{image_id}{ext}"
        (save_dir / filename).write_bytes(content)
        print(f"    [OK] Downloaded {filename} ({len(content)} bytes)")
        return filename

    except requests.RequestException as e:
        print(f"    [ERR] Failed to download image {image_id}: {e}")
        return None


def download_image_by_url(url, save_dir, build_id, tag=""):
    """Download an image from a direct (absolute or relative) URL.

    Relative references are resolved against the ``public/`` directory
    under ``BASE_URL`` (where the pages queried by this script live), and
    host-relative references (``/path``) against the site root. The file
    name embeds a short SHA-256 digest of the resolved URL, so the same
    URL maps to the same file name on every run.

    Args:
        url: Absolute URL, host-relative path, or page-relative reference.
        save_dir: ``Path`` of the directory the file is written to.
        build_id: Building identifier, used as the file-name prefix.
        tag: Optional extra label inserted after ``build_id``.

    Returns:
        The saved file name, or ``None`` when nothing was saved (invalid
        URL, request error, HTML response, or a body shorter than 100
        bytes).
    """
    # Function-scope import keeps this block self-contained.
    import hashlib

    try:
        # urljoin leaves absolute URLs untouched and resolves everything
        # else the way a browser would for a page in public/.
        url = urljoin(f"{BASE_URL}/public/", url)
    except ValueError as e:
        print(f"    [ERR] Failed to download {url}: {e}")
        return None

    try:
        resp = SESSION.get(url, timeout=30, stream=True)
        resp.raise_for_status()

        content_type = resp.headers.get('Content-Type', '')

        # Skip non-image responses
        if 'text/html' in content_type and 'image' not in content_type:
            return None

        # Determine extension: prefer the URL path, else the content type
        path_ext = Path(urlparse(url).path).suffix.lower()
        if path_ext in ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'):
            ext = path_ext
        else:
            ext_map = {
                'image/jpeg': '.jpg',
                'image/png': '.png',
                'image/gif': '.gif',
                'image/bmp': '.bmp',
                'image/webp': '.webp',
            }
            ext = next(
                (suffix for ct, suffix in ext_map.items() if ct in content_type),
                '.jpg',
            )

        # Built-in hash() of a str is salted per process, so it cannot be
        # used for file names that must be stable across runs.
        url_hash = hashlib.sha256(url.encode('utf-8')).hexdigest()[:8]
        tag_part = f"_{tag}" if tag else ""
        filename = f"{build_id}{tag_part}_{url_hash}{ext}"

        # Bodies under 100 bytes are treated as "not a real image"
        content = resp.content
        if len(content) < 100:
            return None

        (save_dir / filename).write_bytes(content)
        print(f"    [OK] Downloaded {filename} ({len(content)} bytes)")
        return filename

    except requests.RequestException as e:
        print(f"    [ERR] Failed to download {url}: {e}")
        return None


def process_building(build_id):
    """Query every configured floor of one building and download its images.

    Steps: query the building without a floor, query each floor from
    ``BUILDING_FLOORS`` (or ``DEFAULT_FLOORS``), store the raw responses,
    download every discovered image ID, then download every discovered
    image URL. A URL that is just a listImg reference to an ID already
    downloaded in the ID step is skipped, so the same picture is not saved
    and counted twice.

    Args:
        build_id: Building identifier; also the name of the sub-directory
            created under ``IMAGES_DIR``.

    Returns:
        List of file names saved for this building.
    """
    print(f"\n{'='*60}")
    print(f"Processing building: {build_id}")
    print(f"{'='*60}")

    build_dir = IMAGES_DIR / build_id
    build_dir.mkdir(parents=True, exist_ok=True)

    all_image_ids = set()
    all_image_urls = set()
    all_raw_responses = {}
    downloaded_files = []

    # Step 1: Query without naviKey (building-level images)
    print("  Querying building-level images...")
    ids, urls, raw = query_load_image(build_id)
    all_image_ids.update(ids)
    all_image_urls.update(urls)
    if raw:
        all_raw_responses[f"{build_id}_base"] = raw

    # Step 2: Query with floor naviKeys
    floors = BUILDING_FLOORS.get(build_id, DEFAULT_FLOORS)
    for floor in floors:
        print(f"  Querying floor {floor}...")
        ids, urls, raw = query_load_image(build_id, navi_key=floor)
        all_image_ids.update(ids)
        all_image_urls.update(urls)
        if raw:
            all_raw_responses[f"{build_id}_{floor}"] = raw
        time.sleep(0.3)  # Be polite to the server

    # Step 3: Save raw responses
    for key, text in all_raw_responses.items():
        raw_file = RAW_RESPONSES_DIR / f"{key}.txt"
        raw_file.write_text(text, encoding='utf-8')

    # Step 4: Download images by ID
    print(f"  Found {len(all_image_ids)} image IDs, {len(all_image_urls)} image URLs")

    downloaded_ids = set()
    for img_id in sorted(all_image_ids):
        fname = download_image_by_id(img_id, build_dir, build_id)
        if fname:
            downloaded_files.append(fname)
            downloaded_ids.add(img_id)
        time.sleep(0.3)

    # Step 5: Download images by URL (a set, so already free of duplicates)
    for img_url in sorted(all_image_urls):
        # Same ID pattern as the extractors use: a URL carrying an ID that
        # step 4 already fetched would only produce a second copy.
        list_img_ref = re.search(r'listImg[&?]q=([A-Za-z0-9_\-\.]+)', img_url)
        if list_img_ref and list_img_ref.group(1) in downloaded_ids:
            continue

        fname = download_image_by_url(img_url, build_dir, build_id)
        if fname:
            downloaded_files.append(fname)
        time.sleep(0.3)

    print(f"  Total downloaded for {build_id}: {len(downloaded_files)} files")
    return downloaded_files


def main():
    """Run the full download: every building in turn, then write the summary.

    A failure while processing one building is reported and recorded in
    the summary under an ``error`` key; the remaining buildings are still
    processed. The summary is written to ``SUMMARY_FILE`` as JSON.
    """
    rule = "=" * 70
    building_count = len(BUILDING_IDS)

    print(rule)
    print("NYCU Yangming Campus GIS - Building Photos Downloader")
    print(f"Target: {building_count} buildings")
    print(f"Output: {OUTPUT_DIR}")
    print(rule)

    ensure_dirs()

    # Connectivity probe: informational only, the run continues either way
    print("\nTesting connectivity...")
    try:
        probe = SESSION.get(f"{BASE_URL}/public/buildinfo.htm", timeout=15)
    except requests.RequestException as exc:
        print(f"  WARNING: Could not reach server: {exc}")
        print("  Continuing anyway...")
    else:
        print(f"  Server response: {probe.status_code}")

    per_building = {}
    total_images = 0

    for position, build_id in enumerate(BUILDING_IDS, start=1):
        print(f"\n[{position}/{building_count}] ", end="")
        try:
            saved = process_building(build_id)
        except Exception as exc:
            # Top-level boundary: report, record, move on to the next building
            print(f"  ERROR processing {build_id}: {exc}")
            per_building[build_id] = {
                "downloaded_files": [],
                "count": 0,
                "error": str(exc),
            }
        else:
            per_building[build_id] = {
                "downloaded_files": saved,
                "count": len(saved),
            }
            total_images += len(saved)

        # Small delay between buildings
        time.sleep(0.5)

    # Assemble and store the summary report
    with_images = len([entry for entry in per_building.values() if entry["count"] > 0])
    summary_data = {
        "total_buildings": building_count,
        "total_images_downloaded": total_images,
        "buildings_with_images": with_images,
        "buildings": per_building,
    }
    SUMMARY_FILE.write_text(
        json.dumps(summary_data, ensure_ascii=False, indent=2),
        encoding='utf-8',
    )

    print("\n" + rule)
    print("DOWNLOAD COMPLETE")
    print(f"  Total buildings processed: {building_count}")
    print(f"  Buildings with images: {summary_data['buildings_with_images']}")
    print(f"  Total images downloaded: {total_images}")
    print(f"  Summary saved to: {SUMMARY_FILE}")
    print(rule)


if __name__ == "__main__":
    main()
