import json
from datetime import datetime


def parse_timestamp(entry):
    """Return the entry's 'timestamp' field as a datetime, or None if absent/invalid."""
    ts = entry.get('timestamp')
    if not ts:
        return None
    try:
        return datetime.fromisoformat(ts)
    except (TypeError, ValueError):
        # Malformed or non-string timestamp: treat as unknown rather than crash.
        return None


def merge_applications(local_path, merge_path, output_path=None):
    """
    Merge two applications.json files, deduplicating by listing_id.

    For keys present in both files, keep the entry with more fields; on a
    field-count tie, keep the one with the later parseable 'timestamp'
    (the local entry wins if either timestamp is missing or unparseable).

    local_path:  primary applications.json
    merge_path:  applications.json to merge in
    output_path: destination file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    merged = {}
    for key in set(local) | set(remote):
        l_entry = local.get(key)
        r_entry = remote.get(key)
        if l_entry and r_entry:
            if len(l_entry) != len(r_entry):
                # Prefer the entry carrying more information.
                merged[key] = l_entry if len(l_entry) > len(r_entry) else r_entry
            else:
                # Same amount of information: prefer the more recent one.
                l_ts = parse_timestamp(l_entry)
                r_ts = parse_timestamp(r_entry)
                if l_ts and r_ts:
                    merged[key] = l_entry if l_ts > r_ts else r_entry
                else:
                    merged[key] = l_entry  # fallback: keep local
        else:
            merged[key] = l_entry or r_entry

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"Merged {len(merged)} unique applications to {output_path}")


if __name__ == "__main__":
    merge_applications(
        "data/applications.json",
        "data/to_merge/applications.json"
    )
+ """ + output_path = output_path or local_path + with open(local_path, encoding='utf-8') as f: + local = json.load(f) + with open(merge_path, encoding='utf-8') as f: + remote = json.load(f) + merged = {} + all_keys = set(local.keys()) | set(remote.keys()) + for key in all_keys: + l_entry = local.get(key) + r_entry = remote.get(key) + if l_entry and r_entry: + l_ts = l_entry.get(timestamp_field) + r_ts = r_entry.get(timestamp_field) + if l_ts and r_ts: + merged[key] = l_entry if l_ts > r_ts else r_entry + else: + merged[key] = l_entry # fallback + else: + merged[key] = l_entry or r_entry + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(merged, f, ensure_ascii=False, indent=2) + print(f"Merged {len(merged)} unique entries to {output_path}") + +if __name__ == "__main__": + merge_dict_json( + "data/listings.json", + "data/to_merge/listings.json" + ) + merge_dict_json( + "data/wgcompany_listings.json", + "data/to_merge/wgcompany_listings.json" + ) diff --git a/helper_functions/merge_listing_times.py b/helper_functions/merge_listing_times.py new file mode 100644 index 0000000..1e4194f --- /dev/null +++ b/helper_functions/merge_listing_times.py @@ -0,0 +1,34 @@ +import csv +from pathlib import Path + +def merge_listing_times(local_path, merge_path, output_path=None): + """ + Merge two listing_times.csv files, deduplicate by listing_id and timestamp. 
import csv


def merge_listing_times(local_path, merge_path, output_path=None):
    """
    Merge two listing_times.csv files, deduplicating by (listing_id, timestamp).

    Rows from local_path come first; order of first occurrence is preserved.

    local_path:  main data/listing_times.csv
    merge_path:  data/to_merge/listing_times.csv
    output_path: where to write the merged file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    seen = set()
    rows = []
    fieldnames = None
    # Read both files, local first so its rows win on duplicates.
    for path in (local_path, merge_path):
        with open(path, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            if fieldnames is None:
                # Keep the column order of the first (local) file's header.
                fieldnames = reader.fieldnames
            for row in reader:
                key = (row['listing_id'], row['timestamp'])
                if key not in seen:
                    seen.add(key)
                    rows.append(row)
    if fieldnames is None:
        # Both inputs lacked even a header row; nothing sensible to write.
        # (The original code crashed with IndexError on rows[0] here.)
        print(f"No rows found; {output_path} left unchanged")
        return
    # Write merged file (header is written even if there are no data rows).
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Merged {len(rows)} unique rows to {output_path}")


if __name__ == "__main__":
    merge_listing_times(
        "data/listing_times.csv",
        "data/to_merge/listing_times.csv"
    )
import csv


def merge_wgcompany_times(local_path, merge_path, output_path=None):
    """
    Merge two wgcompany_times.csv files, deduplicating by (listing_id, timestamp).

    Rows from local_path come first; order of first occurrence is preserved.

    local_path:  main data/wgcompany_times.csv
    merge_path:  data/to_merge/wgcompany_times.csv
    output_path: where to write the merged file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    seen = set()
    rows = []
    fieldnames = None
    # Read both files, local first so its rows win on duplicates.
    for path in (local_path, merge_path):
        with open(path, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            if fieldnames is None:
                # Keep the column order of the first (local) file's header.
                fieldnames = reader.fieldnames
            for row in reader:
                key = (row['listing_id'], row['timestamp'])
                if key not in seen:
                    seen.add(key)
                    rows.append(row)
    if fieldnames is None:
        # Both inputs lacked even a header row; nothing sensible to write.
        # (The original code crashed with IndexError on rows[0] here.)
        print(f"No rows found; {output_path} left unchanged")
        return
    # Write merged file (header is written even if there are no data rows).
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Merged {len(rows)} unique rows to {output_path}")


if __name__ == "__main__":
    merge_wgcompany_times(
        "data/wgcompany_times.csv",
        "data/to_merge/wgcompany_times.csv"
    )