import json
from datetime import datetime


def parse_timestamp(entry):
    """Return the entry's 'timestamp' field as a datetime, or None if absent/invalid."""
    ts = entry.get('timestamp')
    if not ts:
        return None
    try:
        return datetime.fromisoformat(ts)
    except (TypeError, ValueError):
        # Malformed or non-string timestamp: treat as unknown rather than crash.
        return None


def merge_applications(local_path, merge_path, output_path=None):
    """
    Merge two applications.json files, deduplicating by listing_id.

    For keys present in both files, keep the entry with more fields; on a
    field-count tie, keep the one with the later parseable 'timestamp'
    (the local entry wins if either timestamp is missing or unparseable).

    local_path:  primary applications.json
    merge_path:  applications.json to merge in
    output_path: destination file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    merged = {}
    for key in set(local) | set(remote):
        l_entry = local.get(key)
        r_entry = remote.get(key)
        if l_entry and r_entry:
            if len(l_entry) != len(r_entry):
                # Prefer the entry carrying more information.
                merged[key] = l_entry if len(l_entry) > len(r_entry) else r_entry
            else:
                # Same amount of information: prefer the more recent one.
                l_ts = parse_timestamp(l_entry)
                r_ts = parse_timestamp(r_entry)
                if l_ts and r_ts:
                    merged[key] = l_entry if l_ts > r_ts else r_entry
                else:
                    merged[key] = l_entry  # fallback: keep local
        else:
            merged[key] = l_entry or r_entry

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"Merged {len(merged)} unique applications to {output_path}")


if __name__ == "__main__":
    merge_applications(
        "data/applications.json",
        "data/to_merge/applications.json"
    )
+ """ + output_path = output_path or local_path + with open(local_path, encoding='utf-8') as f: + local = json.load(f) + with open(merge_path, encoding='utf-8') as f: + remote = json.load(f) + merged = {} + all_keys = set(local.keys()) | set(remote.keys()) + for key in all_keys: + l_entry = local.get(key) + r_entry = remote.get(key) + if l_entry and r_entry: + l_ts = l_entry.get(timestamp_field) + r_ts = r_entry.get(timestamp_field) + if l_ts and r_ts: + merged[key] = l_entry if l_ts > r_ts else r_entry + else: + merged[key] = l_entry # fallback + else: + merged[key] = l_entry or r_entry + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(merged, f, ensure_ascii=False, indent=2) + print(f"Merged {len(merged)} unique entries to {output_path}") + +if __name__ == "__main__": + merge_dict_json( + "data/listings.json", + "data/to_merge/listings.json" + ) + merge_dict_json( + "data/wgcompany_listings.json", + "data/to_merge/wgcompany_listings.json" + ) diff --git a/helper_functions/merge_listing_times.py b/helper_functions/merge_listing_times.py new file mode 100644 index 0000000..1e4194f --- /dev/null +++ b/helper_functions/merge_listing_times.py @@ -0,0 +1,34 @@ +import csv +from pathlib import Path + +def merge_listing_times(local_path, merge_path, output_path=None): + """ + Merge two listing_times.csv files, deduplicate by listing_id and timestamp. 
import csv


def merge_listing_times(local_path, merge_path, output_path=None):
    """
    Merge two listing_times.csv files, deduplicating by (listing_id, timestamp).

    Rows from local_path come first; order of first occurrence is preserved.

    local_path:  main data/listing_times.csv
    merge_path:  data/to_merge/listing_times.csv
    output_path: where to write the merged file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    seen = set()
    rows = []
    fieldnames = None
    # Read both files, local first so its rows win on duplicates.
    for path in (local_path, merge_path):
        with open(path, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            if fieldnames is None:
                # Keep the column order of the first (local) file's header.
                fieldnames = reader.fieldnames
            for row in reader:
                key = (row['listing_id'], row['timestamp'])
                if key not in seen:
                    seen.add(key)
                    rows.append(row)
    if fieldnames is None:
        # Both inputs lacked even a header row; nothing sensible to write.
        # (The original code crashed with IndexError on rows[0] here.)
        print(f"No rows found; {output_path} left unchanged")
        return
    # Write merged file (header is written even if there are no data rows).
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Merged {len(rows)} unique rows to {output_path}")


if __name__ == "__main__":
    merge_listing_times(
        "data/listing_times.csv",
        "data/to_merge/listing_times.csv"
    )
import csv


def merge_wgcompany_times(local_path, merge_path, output_path=None):
    """
    Merge two wgcompany_times.csv files, deduplicating by (listing_id, timestamp).

    Rows from local_path come first; order of first occurrence is preserved.

    local_path:  main data/wgcompany_times.csv
    merge_path:  data/to_merge/wgcompany_times.csv
    output_path: where to write the merged file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    seen = set()
    rows = []
    fieldnames = None
    # Read both files, local first so its rows win on duplicates.
    for path in (local_path, merge_path):
        with open(path, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            if fieldnames is None:
                # Keep the column order of the first (local) file's header.
                fieldnames = reader.fieldnames
            for row in reader:
                key = (row['listing_id'], row['timestamp'])
                if key not in seen:
                    seen.add(key)
                    rows.append(row)
    if fieldnames is None:
        # Both inputs lacked even a header row; nothing sensible to write.
        # (The original code crashed with IndexError on rows[0] here.)
        print(f"No rows found; {output_path} left unchanged")
        return
    # Write merged file (header is written even if there are no data rows).
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Merged {len(rows)} unique rows to {output_path}")


if __name__ == "__main__":
    merge_wgcompany_times(
        "data/wgcompany_times.csv",
        "data/to_merge/wgcompany_times.csv"
    )