#!/usr/bin/env python3
"""
Merge all data from prod and dev environments.
Handles applications.json, listings.json, wgcompany_listings.json, and CSV files.
For failed applications with duplicates, keeps the earlier timestamp.
"""
import json
import pandas as pd
from pathlib import Path
from datetime import datetime

def parse_timestamp(ts_str):
    """Parse ISO format timestamp string to datetime object."""
    if ts_str:
        try:
            return datetime.fromisoformat(ts_str)
        except Exception:
            return None
    return None

def merge_applications(local_path, merge_path, output_path=None):
    """
    Merge two applications.json files, deduplicate by listing_id.
    Special handling: For failed applications with duplicates, keep the earlier timestamp.
    For successful applications, keep the entry with more complete data.
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    merged = {}
    all_keys = set(local.keys()) | set(remote.keys())

    for key in all_keys:
        l_entry = local.get(key)
        r_entry = remote.get(key)

        if l_entry and r_entry:
            # Both have this application
            l_success = l_entry.get('success', False)
            r_success = r_entry.get('success', False)
            l_ts = parse_timestamp(l_entry.get('timestamp'))
            r_ts = parse_timestamp(r_entry.get('timestamp'))

            # If both failed, keep the one with earlier timestamp (to avoid timestamp corruption bug)
            if not l_success and not r_success:
                if l_ts and r_ts:
                    merged[key] = l_entry if l_ts < r_ts else r_entry
                else:
                    merged[key] = l_entry  # fallback if timestamp missing
            # If one succeeded and one failed, keep the successful one
            elif l_success and not r_success:
                merged[key] = l_entry
            elif r_success and not l_success:
                merged[key] = r_entry
            # If both succeeded, prefer entry with more fields, or latest timestamp
            else:
                if len(l_entry) > len(r_entry):
                    merged[key] = l_entry
                elif len(r_entry) > len(l_entry):
                    merged[key] = r_entry
                else:
                    # Same length, prefer latest timestamp
                    if l_ts and r_ts:
                        merged[key] = l_entry if l_ts > r_ts else r_entry
                    else:
                        merged[key] = l_entry
        else:
            # Only one has this application
            merged[key] = l_entry or r_entry

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    print(f"✓ Merged applications: {len(merged)} unique entries → {output_path}")
    return merged

def merge_dict_json(local_path, merge_path, output_path=None, timestamp_field='fetched_at'):
    """
    Merge two dict-based JSON files (keyed by id), deduplicate by key.
    If duplicate, keep entry with latest timestamp_field.
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    merged = {}
    all_keys = set(local.keys()) | set(remote.keys())

    for key in all_keys:
        l_entry = local.get(key)
        r_entry = remote.get(key)

        if l_entry and r_entry:
            l_ts = l_entry.get(timestamp_field)
            r_ts = r_entry.get(timestamp_field)
            if l_ts and r_ts:
                merged[key] = l_entry if l_ts > r_ts else r_entry
            else:
                merged[key] = l_entry
        else:
            merged[key] = l_entry or r_entry

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    print(f"✓ Merged {Path(local_path).name}: {len(merged)} unique entries → {output_path}")
    return merged

def merge_csv_times(local_path, merge_path, output_path=None):
    """
    Merge two CSV files with time-series data, deduplicate by all columns.
    Keeps unique rows based on all column values.
    """
    output_path = output_path or local_path

    local_df = pd.read_csv(local_path)
    remote_df = pd.read_csv(merge_path)

    # Combine and drop duplicates
    merged_df = pd.concat([local_df, remote_df], ignore_index=True)
    merged_df = merged_df.drop_duplicates()

    # Sort by timestamp if present
    if 'timestamp' in merged_df.columns:
        merged_df = merged_df.sort_values('timestamp')

    merged_df.to_csv(output_path, index=False)

    print(f"✓ Merged {Path(local_path).name}: {len(merged_df)} rows → {output_path}")
    return merged_df

def merge_all_data(local_base_dir="data", merge_base_dir="data/to_merge", output_base_dir=None):
    """
    Main function to merge all data from prod and dev environments.

    Args:
        local_base_dir: Base directory for local (dev) data
        merge_base_dir: Base directory for data to merge (prod)
        output_base_dir: Output directory (defaults to local_base_dir)

    Returns:
        dict: Summary of merge results
    """
    output_base_dir = output_base_dir or local_base_dir
    local_base = Path(local_base_dir)
    merge_base = Path(merge_base_dir)
    output_base = Path(output_base_dir)

    print("=" * 60)
    print("MERGING PROD AND DEV DATA")
    print("=" * 60)

    results = {}

    # 1. Merge applications.json (special handling for failed duplicates)
    if (local_base / "applications.json").exists() and (merge_base / "applications.json").exists():
        results['applications'] = merge_applications(
            str(local_base / "applications.json"),
            str(merge_base / "applications.json"),
            str(output_base / "applications.json")
        )

    # 2. Merge listings.json
    if (local_base / "listings.json").exists() and (merge_base / "listings.json").exists():
        results['listings'] = merge_dict_json(
            str(local_base / "listings.json"),
            str(merge_base / "listings.json"),
            str(output_base / "listings.json"),
            timestamp_field='fetched_at'
        )

    # 3. Merge wgcompany_listings.json
    if (local_base / "wgcompany_listings.json").exists() and (merge_base / "wgcompany_listings.json").exists():
        results['wgcompany_listings'] = merge_dict_json(
            str(local_base / "wgcompany_listings.json"),
            str(merge_base / "wgcompany_listings.json"),
            str(output_base / "wgcompany_listings.json"),
            timestamp_field='fetched_at'
        )

    # 4. Merge listing_times.csv
    if (local_base / "listing_times.csv").exists() and (merge_base / "listing_times.csv").exists():
        results['listing_times'] = merge_csv_times(
            str(local_base / "listing_times.csv"),
            str(merge_base / "listing_times.csv"),
            str(output_base / "listing_times.csv")
        )

    # 5. Merge wgcompany_times.csv
    if (local_base / "wgcompany_times.csv").exists() and (merge_base / "wgcompany_times.csv").exists():
        results['wgcompany_times'] = merge_csv_times(
            str(local_base / "wgcompany_times.csv"),
            str(merge_base / "wgcompany_times.csv"),
            str(output_base / "wgcompany_times.csv")
        )

    print("=" * 60)
    print("MERGE COMPLETE")
    print("=" * 60)

    return results

if __name__ == "__main__":
    # Usage: Place prod data in data/to_merge/ directory, then run this script
    merge_all_data()