import json
from datetime import datetime
from pathlib import Path


def parse_timestamp(entry):
    """Return the entry's ``'timestamp'`` value parsed as a ``datetime``.

    Args:
        entry: Mapping that may carry an ISO-8601 string under ``'timestamp'``.

    Returns:
        The parsed ``datetime``, or ``None`` when the key is missing, empty,
        not a string, or not a valid ISO-8601 value.
    """
    ts = entry.get('timestamp')
    if not ts or not isinstance(ts, str):
        return None
    # datetime.fromisoformat() only accepts a trailing 'Z' from Python 3.11
    # on; normalise it so UTC timestamps parse on older interpreters as well.
    if ts.endswith('Z'):
        ts = ts[:-1] + '+00:00'
    try:
        return datetime.fromisoformat(ts)
    except ValueError:
        return None


def _pick_entry(l_entry, r_entry):
    """Choose which of two entries for the same key survives the merge."""
    if not (l_entry and r_entry):
        # At most one side carries data: keep whichever is non-empty.
        return l_entry or r_entry

    # Prefer the entry with more fields.
    if len(l_entry) != len(r_entry):
        return l_entry if len(l_entry) > len(r_entry) else r_entry

    # Same number of fields: prefer the strictly later timestamp.
    l_ts = parse_timestamp(l_entry)
    r_ts = parse_timestamp(r_entry)
    if l_ts is None or r_ts is None:
        return l_entry  # fallback: no usable timestamp on at least one side
    try:
        return l_entry if l_ts > r_ts else r_entry
    except TypeError:
        # Naive and timezone-aware datetimes cannot be ordered; treat that
        # like a missing timestamp instead of aborting the whole merge.
        return l_entry


def merge_applications(local_path, merge_path, output_path=None):
    """Merge two applications.json files, deduplicating by top-level key.

    Both files are read as JSON objects (presumably keyed by listing_id --
    confirm against the producer of these files). When a key exists in both
    files, the entry with more fields wins; on a tie the entry with the later
    ``'timestamp'`` wins; if the timestamps are missing, unparsable or not
    comparable, the local entry is kept.

    Keys are written in a deterministic order: local keys first, followed by
    the keys that only exist in the merge file.

    Args:
        local_path: Path of the primary JSON file.
        merge_path: Path of the JSON file to merge in.
        output_path: Destination path; defaults to ``local_path``, which is
            then overwritten. Missing parent directories are created.

    Returns:
        None. The merged mapping is written to ``output_path`` and a summary
        line is printed.
    """
    output_path = output_path or local_path

    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    # dict.fromkeys() de-duplicates while preserving first-seen order, so the
    # output is stable between runs (a set of str keys is not).
    all_keys = dict.fromkeys([*local, *remote])
    merged = {
        key: _pick_entry(local.get(key), remote.get(key))
        for key in all_keys
    }

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"Merged {len(merged)} unique applications to {output_path}")


if __name__ == "__main__":
    merge_applications(
        "data/applications.json",
        "data/to_merge/applications.json"
    )