"""Merge dict-based JSON files keyed by id, keeping the newest entry per key."""

import json
from pathlib import Path
from datetime import datetime


def parse_timestamp(entry, timestamp_field='fetched_at'):
    """Return the entry's timestamp parsed as a ``datetime``, or ``None``.

    Args:
        entry: A dict that may carry an ISO-8601 string under
            ``timestamp_field``. Anything that is not a dict yields ``None``.
        timestamp_field: Key holding the timestamp (default ``'fetched_at'``).

    Returns:
        The parsed ``datetime``, or ``None`` when the field is missing,
        empty, not a string, or not valid ISO-8601.
    """
    if not isinstance(entry, dict):
        return None
    ts = entry.get(timestamp_field)
    if not ts:
        return None
    # ``datetime.fromisoformat`` only accepts a trailing "Z" from Python 3.11
    # on; spell it as an explicit UTC offset so older interpreters parse it.
    if isinstance(ts, str) and ts.endswith('Z'):
        ts = ts[:-1] + '+00:00'
    try:
        return datetime.fromisoformat(ts)
    except (TypeError, ValueError):
        return None


def _pick_entry(l_entry, r_entry, timestamp_field):
    """Choose between two entries that share a key; the newer one wins."""
    # A falsy entry (None, {}) carries no data: take the other side.
    if not l_entry:
        return r_entry
    if not r_entry:
        return l_entry
    # Entries without a readable timestamp cannot be ordered: keep local.
    if not isinstance(l_entry, dict) or not isinstance(r_entry, dict):
        return l_entry
    l_raw = l_entry.get(timestamp_field)
    r_raw = r_entry.get(timestamp_field)
    if not l_raw or not r_raw:
        return l_entry

    l_ts = parse_timestamp(l_entry, timestamp_field)
    r_ts = parse_timestamp(r_entry, timestamp_field)
    if l_ts is not None and r_ts is not None:
        try:
            # Compare real instants, not strings, so differing UTC offsets
            # or fractional-second formats are ordered correctly.
            return l_entry if l_ts > r_ts else r_entry
        except TypeError:
            # Naive vs. timezone-aware datetimes cannot be compared;
            # fall through to the raw-value comparison below.
            pass

    # Non-ISO timestamps (e.g. epoch numbers): compare the raw values.
    try:
        return l_entry if l_raw > r_raw else r_entry
    except TypeError:
        return l_entry


def merge_dict_json(local_path, merge_path, output_path=None, timestamp_field='fetched_at'):
    """Merge two dict-based JSON files (keyed by id) and write the result.

    Keys present in only one file are copied through unchanged. For keys
    present in both, the entry with the later ``timestamp_field`` is kept
    (the remote entry wins a tie). If either entry lacks a timestamp, the
    local entry is kept.

    Output keys are written in a deterministic order: local keys in their
    original order, followed by keys that only exist in the merge file.

    Args:
        local_path: Path of the base JSON file (top level must be an object).
        merge_path: Path of the JSON file to merge in (same shape).
        output_path: Where to write the result; defaults to ``local_path``,
            i.e. the base file is overwritten.
        timestamp_field: Entry key holding the timestamp used to pick the
            newer of two duplicates.
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    merged = {}
    # Iterating the dicts (not a set of keys) keeps the output order stable
    # across runs, so repeated merges do not produce noisy diffs.
    for key, l_entry in local.items():
        if key in remote:
            merged[key] = _pick_entry(l_entry, remote[key], timestamp_field)
        else:
            merged[key] = l_entry
    for key, r_entry in remote.items():
        if key not in local:
            merged[key] = r_entry

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"Merged {len(merged)} unique entries to {output_path}")


if __name__ == "__main__":
    merge_dict_json(
        "data/listings.json",
        "data/to_merge/listings.json"
    )
    merge_dict_json(
        "data/wgcompany_listings.json",
        "data/to_merge/wgcompany_listings.json"
    )