"""
WFS CQL_FILTER Advanced Query Explorer
Systematically queries GeoServer WFS with CQL_FILTER and advanced parameters
to extract maximum data from all gis: layers.
"""

import json
import os
import sys
import time
import urllib.parse
from datetime import datetime

import requests
import urllib3

# Suppress SSL warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Fix encoding
sys.stdout.reconfigure(encoding="utf-8")

BASE_URL = "https://ymspace.ga.nycu.edu.tw:8080/geoserver/wfs"
OUTPUT_DIR = r"C:\Users\thc1006\Desktop\NQSD\新增資料夾\data\ymmap_archive\wfs_cql_queries"
EXISTING_DIR = r"C:\Users\thc1006\Desktop\NQSD\新增資料夾\data\ymmap_archive\wfs_all_gis_layers"

# Key layers to query with CQL_FILTER
KEY_LAYERS = [
    "gis:gis_building_geom",
    "gis:gis_building",
    "gis:gis_campus",
    "gis:gis_parking",
    "gis:gis_busstop",
    "gis:gis_restaurant",
    "gis:gis_aed",
    "gis:gis_artwork",
    "gis:gis_atm",
    "gis:gis_auditorium",
    "gis:gis_auditorium2",
    "gis:gis_barrierfreetoilet",
    "gis:gis_barrierfreetoilet2",
    "gis:gis_block",
    "gis:gis_busroutes_1",
    "gis:gis_busroutes_2",
    "gis:gis_busroutes_3",
    "gis:gis_campusphotos",
    "gis:gis_conveniencestore",
    "gis:gis_elevator",
    "gis:gis_elevator2",
    "gis:gis_emergencycall",
    "gis:gis_emergencycall2",
    "gis:gis_firestation",
    "gis:gis_gateway",
    "gis:gis_handicapparking",
    "gis:gis_handicapparking2",
    "gis:gis_healthroom",
    "gis:gis_monthlyparking",
    "gis:gis_postoffice",
    "gis:gis_securityoffice",
    "gis:gis_sport",
    "gis:gis_sport_p",
    "gis:gis_telephonebooth",
    "gis:gis_timerecorder",
    "gis:gis_wheelchairramp",
    "gis:gis_wheelchairramp2",
    "gis:gis_sidewalk_1",
    "gis:gis_sidewalk_2",
    "gis:gis_sidewalk_3",
    "gis:gis_sidewalk_4",
    "gis:gis_sidewalk_5",
    "gis:gis_sidewalk_6",
    "gis:gis_sidewalk_7",
    "gis:gis_sidewalk_8",
    "gis:gis_sidewalk_9",
    "gis:gis_sidewalk_10",
    "gis:gis_sidewalk_11",
    "gis:gis_sidewalk_12",
    "gis:gis_sidewalk_13",
    "gis:gis_sidewalk_14",
    "gis:gis_sidewalk_15",
]


def wfs_request(params, timeout=30):
    """Make a WFS request with given parameters."""
    default_params = {
        "service": "WFS",
        "version": "1.1.0",
    }
    default_params.update(params)
    try:
        resp = requests.get(
            BASE_URL, params=default_params, verify=False, timeout=timeout
        )
        return resp
    except Exception as e:
        print(f"  [ERROR] Request failed: {e}")
        return None


def get_feature_count(type_name):
    """Get the number of features for a layer using resultType=hits."""
    resp = wfs_request(
        {
            "request": "GetFeature",
            "typeName": type_name,
            "resultType": "hits",
        }
    )
    if resp and resp.status_code == 200:
        text = resp.text
        # Parse numberOfFeatures from XML
        import re

        match = re.search(r'numberOfFeatures="(\d+)"', text)
        if match:
            return int(match.group(1))
    return None


def get_features_json(type_name, cql_filter=None, max_features=None, start_index=None, sort_by=None, property_name=None):
    """Get features as GeoJSON with optional CQL_FILTER and pagination."""
    params = {
        "request": "GetFeature",
        "typeName": type_name,
        "outputFormat": "application/json",
    }
    if cql_filter:
        params["CQL_FILTER"] = cql_filter
    if max_features:
        params["maxFeatures"] = str(max_features)
    if start_index is not None:
        params["startIndex"] = str(start_index)
    if sort_by:
        params["sortBy"] = sort_by
    if property_name:
        params["propertyName"] = property_name

    resp = wfs_request(params, timeout=60)
    if resp and resp.status_code == 200:
        try:
            return resp.json()
        except json.JSONDecodeError:
            print(f"  [WARN] Non-JSON response for {type_name}")
            return {"error": "non-json", "text": resp.text[:500]}
    elif resp:
        return {"error": f"HTTP {resp.status_code}", "text": resp.text[:500]}
    return None


def describe_feature_type(type_name):
    """Get the schema/properties of a layer using DescribeFeatureType."""
    resp = wfs_request(
        {
            "request": "DescribeFeatureType",
            "typeName": type_name,
            "outputFormat": "application/json",
        }
    )
    if resp and resp.status_code == 200:
        try:
            return resp.json()
        except json.JSONDecodeError:
            # Try XML
            return {"raw_xml": resp.text[:2000]}
    return None


def count_existing_features(layer_name):
    """Count features in existing downloaded files."""
    # Convert layer name to filename pattern
    fname = layer_name.replace(":", "_") + ".json"
    fpath = os.path.join(EXISTING_DIR, fname)
    if os.path.exists(fpath):
        try:
            with open(fpath, "r", encoding="utf-8") as f:
                data = json.load(f)
            if "features" in data:
                return len(data["features"])
            elif "numberReturned" in data:
                return data["numberReturned"]
        except Exception:
            pass
    return None


def explore_layer_cql(layer_name):
    """Comprehensive exploration of a single layer with CQL_FILTER."""
    print(f"\n{'='*70}")
    print(f"Exploring: {layer_name}")
    print(f"{'='*70}")

    result = {
        "layer": layer_name,
        "timestamp": datetime.now().isoformat(),
        "queries": {},
    }

    # 1. Get feature count (baseline)
    print("  [1] Getting feature count (resultType=hits)...")
    count = get_feature_count(layer_name)
    result["total_feature_count"] = count
    print(f"      Total features: {count}")

    # 2. Get schema via DescribeFeatureType
    print("  [2] Getting schema (DescribeFeatureType)...")
    schema = describe_feature_type(layer_name)
    result["schema"] = schema
    if schema and "featureTypes" in schema:
        props = schema["featureTypes"][0].get("properties", [])
        prop_names = [p["name"] for p in props]
        print(f"      Properties: {prop_names}")
        result["property_names"] = prop_names
    else:
        prop_names = []

    # 3. CQL_FILTER=1=1 (get ALL features)
    print("  [3] CQL_FILTER=1=1 (all features)...")
    data_all = get_features_json(layer_name, cql_filter="1=1")
    if data_all and "features" in data_all:
        n = len(data_all["features"])
        print(f"      Features returned: {n}")
        result["queries"]["cql_1eq1"] = {
            "filter": "1=1",
            "feature_count": n,
        }
        # Save full data
        safe_name = layer_name.replace(":", "_")
        fpath = os.path.join(OUTPUT_DIR, f"{safe_name}_cql_all.json")
        with open(fpath, "w", encoding="utf-8") as f:
            json.dump(data_all, f, ensure_ascii=False, indent=2)
        print(f"      Saved to: {fpath}")
    elif data_all:
        result["queries"]["cql_1eq1"] = {"error": str(data_all.get("error", "unknown"))}

    # 4. No filter (default GetFeature) for comparison
    print("  [4] Default GetFeature (no filter)...")
    data_default = get_features_json(layer_name)
    if data_default and "features" in data_default:
        n = len(data_default["features"])
        print(f"      Features returned: {n}")
        result["queries"]["no_filter"] = {"feature_count": n}
    elif data_default:
        result["queries"]["no_filter"] = {"error": str(data_default.get("error", "unknown"))}

    # 5. Compare with existing data
    existing_count = count_existing_features(layer_name)
    result["existing_file_count"] = existing_count
    if existing_count is not None:
        print(f"  [5] Existing file has: {existing_count} features")
    else:
        print(f"  [5] No existing file found")

    # 6. Try pagination (if large dataset)
    if count and count > 100:
        print(f"  [6] Trying pagination (maxFeatures=50, startIndex=0)...")
        data_page = get_features_json(layer_name, max_features=50, start_index=0)
        if data_page and "features" in data_page:
            print(f"      Page 1 (0-49): {len(data_page['features'])} features")
            result["queries"]["pagination_page1"] = {"feature_count": len(data_page["features"])}

        data_page2 = get_features_json(layer_name, max_features=50, start_index=50)
        if data_page2 and "features" in data_page2:
            print(f"      Page 2 (50-99): {len(data_page2['features'])} features")
            result["queries"]["pagination_page2"] = {"feature_count": len(data_page2["features"])}

    # 7. Try various CQL filters on text fields
    if prop_names:
        # Try LIKE filters on string properties
        for pname in prop_names[:5]:  # Only first 5 to avoid too many requests
            if pname in ("the_geom", "geom", "geometry"):
                continue
            print(f"  [7] Trying CQL_FILTER: {pname} IS NOT NULL")
            data_notnull = get_features_json(
                layer_name, cql_filter=f"{pname} IS NOT NULL", max_features=5
            )
            if data_notnull and "features" in data_notnull:
                n = len(data_notnull["features"])
                print(f"      {pname} IS NOT NULL: {n} features (sample)")
                if n > 0:
                    # Show sample values
                    sample_vals = []
                    for feat in data_notnull["features"][:3]:
                        val = feat.get("properties", {}).get(pname)
                        if val is not None:
                            sample_vals.append(str(val)[:100])
                    if sample_vals:
                        print(f"      Sample values: {sample_vals}")
                        result["queries"][f"notnull_{pname}"] = {
                            "filter": f"{pname} IS NOT NULL",
                            "sample_count": n,
                            "sample_values": sample_vals,
                        }
            time.sleep(0.3)

    return result


def explore_layer_full_download(layer_name, max_features_per_page=500):
    """Download ALL features using pagination for large layers."""
    count = get_feature_count(layer_name)
    if not count:
        return None

    print(f"\n  Full download of {layer_name}: {count} features")
    all_features = []
    offset = 0

    while offset < count:
        data = get_features_json(
            layer_name,
            cql_filter="1=1",
            max_features=max_features_per_page,
            start_index=offset,
        )
        if data and "features" in data:
            batch = data["features"]
            all_features.extend(batch)
            print(f"    Downloaded {offset}-{offset+len(batch)-1} ({len(batch)} features)")
            if len(batch) < max_features_per_page:
                break
            offset += max_features_per_page
        else:
            print(f"    Failed at offset {offset}")
            break
        time.sleep(0.5)

    if all_features:
        # Build a complete GeoJSON
        result = {
            "type": "FeatureCollection",
            "totalFeatures": count,
            "features": all_features,
            "numberReturned": len(all_features),
        }
        safe_name = layer_name.replace(":", "_")
        fpath = os.path.join(OUTPUT_DIR, f"{safe_name}_COMPLETE.json")
        with open(fpath, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"    Saved COMPLETE: {fpath} ({len(all_features)} features)")
        return len(all_features)
    return 0


def main():
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("=" * 70)
    print("WFS CQL_FILTER Advanced Query Explorer")
    print(f"Target: {BASE_URL}")
    print(f"Output: {OUTPUT_DIR}")
    print(f"Started: {datetime.now().isoformat()}")
    print("=" * 70)

    # Phase 1: Get capabilities to discover all layers
    print("\n--- Phase 1: GetCapabilities ---")
    resp = wfs_request({"request": "GetCapabilities"})
    if resp and resp.status_code == 200:
        caps_path = os.path.join(OUTPUT_DIR, "capabilities.xml")
        with open(caps_path, "w", encoding="utf-8") as f:
            f.write(resp.text)
        print(f"  Saved capabilities to {caps_path}")

        # Extract layer names from capabilities
        import re
        layer_names = re.findall(r"<Name>(gis:[^<]+)</Name>", resp.text)
        layer_names = list(dict.fromkeys(layer_names))  # deduplicate preserving order
        print(f"  Found {len(layer_names)} gis: layers in capabilities")

        # Check for layers not in our KEY_LAYERS list
        key_set = set(KEY_LAYERS)
        new_layers = [l for l in layer_names if l not in key_set]
        if new_layers:
            print(f"  NEW layers not in KEY_LAYERS: {new_layers}")
    else:
        print("  Failed to get capabilities")
        layer_names = KEY_LAYERS

    # Phase 2: Explore all key layers
    print("\n--- Phase 2: CQL_FILTER exploration of key layers ---")
    all_results = {}
    priority_layers = [
        "gis:gis_building_geom",
        "gis:gis_building",
        "gis:gis_campus",
        "gis:gis_parking",
        "gis:gis_busstop",
        "gis:gis_restaurant",
        "gis:gis_aed",
        "gis:gis_artwork",
        "gis:gis_atm",
        "gis:gis_auditorium",
        "gis:gis_auditorium2",
        "gis:gis_barrierfreetoilet",
        "gis:gis_barrierfreetoilet2",
        "gis:gis_campusphotos",
        "gis:gis_conveniencestore",
        "gis:gis_elevator",
        "gis:gis_elevator2",
        "gis:gis_emergencycall",
        "gis:gis_emergencycall2",
        "gis:gis_firestation",
        "gis:gis_gateway",
        "gis:gis_handicapparking",
        "gis:gis_handicapparking2",
        "gis:gis_healthroom",
        "gis:gis_monthlyparking",
        "gis:gis_postoffice",
        "gis:gis_securityoffice",
        "gis:gis_sport",
        "gis:gis_sport_p",
        "gis:gis_telephonebooth",
        "gis:gis_timerecorder",
        "gis:gis_wheelchairramp",
        "gis:gis_wheelchairramp2",
        "gis:gis_block",
    ]

    # Add any newly discovered layers
    for l in layer_names:
        if l not in priority_layers and "sidewalk" not in l and "parcel" not in l:
            priority_layers.append(l)

    for layer in priority_layers:
        try:
            result = explore_layer_cql(layer)
            all_results[layer] = result
            time.sleep(0.5)
        except Exception as e:
            print(f"  [ERROR] Failed to explore {layer}: {e}")
            all_results[layer] = {"error": str(e)}

    # Phase 3: Full download for layers with potential hidden data
    print("\n--- Phase 3: Full download for discrepant layers ---")
    discrepant = []
    for layer, info in all_results.items():
        if isinstance(info, dict) and "error" not in info:
            total = info.get("total_feature_count")
            existing = info.get("existing_file_count")
            cql_count = info.get("queries", {}).get("cql_1eq1", {}).get("feature_count")

            if total and existing and total > existing:
                discrepant.append((layer, total, existing, "total > existing"))
            elif total and cql_count and total > cql_count:
                discrepant.append((layer, total, cql_count, "total > cql_returned"))

    if discrepant:
        print(f"  Found {len(discrepant)} layers with potential hidden data:")
        for layer, total, got, reason in discrepant:
            print(f"    {layer}: total={total}, got={got} ({reason})")
            explore_layer_full_download(layer)
    else:
        print("  No discrepancies found - all data appears complete")

    # Phase 4: Try some creative CQL filters
    print("\n--- Phase 4: Creative CQL filters ---")

    # Try BBOX filter for different campus areas
    campus_bboxes = {
        "yangming": "121.5100,25.1200,121.5250,25.1350",  # Yangming campus area
        "guangfu": "120.9900,24.7800,121.0100,24.8000",   # Guangfu campus area
        "boai": "120.9700,24.7900,120.9800,24.8000",      # Bo'ai campus area
        "tainan": "120.2700,22.9900,120.2900,23.0100",    # Tainan campus area
    }

    for campus_name, bbox in campus_bboxes.items():
        print(f"\n  BBOX filter for {campus_name} campus: {bbox}")
        for layer in ["gis:gis_building_geom", "gis:gis_building", "gis:gis_restaurant", "gis:gis_parking"]:
            data = get_features_json(
                layer,
                cql_filter=f"BBOX(the_geom,{bbox})",
                max_features=200,
            )
            if data and "features" in data:
                n = len(data["features"])
                if n > 0:
                    print(f"    {layer}: {n} features in {campus_name}")
                    safe = f"{layer.replace(':', '_')}_bbox_{campus_name}.json"
                    fpath = os.path.join(OUTPUT_DIR, safe)
                    with open(fpath, "w", encoding="utf-8") as f:
                        json.dump(data, f, ensure_ascii=False, indent=2)
            time.sleep(0.3)

    # Phase 5: Generate summary
    print("\n--- Phase 5: Generating summary ---")
    summary = {
        "timestamp": datetime.now().isoformat(),
        "base_url": BASE_URL,
        "layers_explored": len(all_results),
        "layer_summaries": {},
        "discrepancies": [],
        "new_data_found": [],
    }

    for layer, info in all_results.items():
        if isinstance(info, dict) and "error" not in info:
            total = info.get("total_feature_count", 0)
            existing = info.get("existing_file_count")
            cql_count = info.get("queries", {}).get("cql_1eq1", {}).get("feature_count", 0)
            props = info.get("property_names", [])

            layer_summary = {
                "total_features": total,
                "cql_all_features": cql_count,
                "existing_file_features": existing,
                "properties": props,
                "property_count": len(props),
            }

            if total and existing and total != existing:
                diff = total - existing
                layer_summary["discrepancy"] = diff
                summary["discrepancies"].append({
                    "layer": layer,
                    "total": total,
                    "existing": existing,
                    "difference": diff,
                })

            if cql_count and (not existing or cql_count > existing):
                summary["new_data_found"].append({
                    "layer": layer,
                    "cql_count": cql_count,
                    "existing_count": existing or 0,
                    "new_features": cql_count - (existing or 0),
                })

            summary["layer_summaries"][layer] = layer_summary
        else:
            summary["layer_summaries"][layer] = {"error": str(info)}

    summary_path = os.path.join(OUTPUT_DIR, "CQL_EXPLORATION_SUMMARY.json")
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)
    print(f"  Summary saved to: {summary_path}")

    # Save detailed results
    details_path = os.path.join(OUTPUT_DIR, "CQL_EXPLORATION_DETAILS.json")
    with open(details_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2, default=str)
    print(f"  Details saved to: {details_path}")

    # Print final summary
    print("\n" + "=" * 70)
    print("FINAL SUMMARY")
    print("=" * 70)
    print(f"Total layers explored: {len(all_results)}")
    print(f"Discrepancies found: {len(summary['discrepancies'])}")
    print(f"New data found in: {len(summary['new_data_found'])} layers")

    if summary["discrepancies"]:
        print("\nDiscrepant layers:")
        for d in summary["discrepancies"]:
            print(f"  {d['layer']}: total={d['total']}, existing={d['existing']}, diff={d['difference']}")

    if summary["new_data_found"]:
        print("\nLayers with new/additional data:")
        for d in summary["new_data_found"]:
            print(f"  {d['layer']}: cql={d['cql_count']}, existing={d['existing_count']}, new={d['new_features']}")

    # Print property summary for key layers
    print("\nProperty names for key layers:")
    for layer in ["gis:gis_building_geom", "gis:gis_building", "gis:gis_campus",
                   "gis:gis_parking", "gis:gis_busstop", "gis:gis_restaurant", "gis:gis_aed"]:
        if layer in all_results and "property_names" in all_results[layer]:
            print(f"  {layer}: {all_results[layer]['property_names']}")

    print(f"\nCompleted: {datetime.now().isoformat()}")


if __name__ == "__main__":
    main()
