after merge

parent 55a6ddb819
commit deb074f074

4 changed files with 171 additions and 0 deletions
helper_functions/merge_applications.py (new file)

@@ -0,0 +1,53 @@
import json
from pathlib import Path
from datetime import datetime

def parse_timestamp(entry):
    ts = entry.get('timestamp')
    if ts:
        try:
            return datetime.fromisoformat(ts)
        except Exception:
            return None
    return None

def merge_applications(local_path, merge_path, output_path=None):
    """
    Merge two applications.json files, deduplicate by listing_id.
    If duplicate, keep entry with more fields, or latest timestamp.
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)
    merged = {}
    all_keys = set(local.keys()) | set(remote.keys())
    for key in all_keys:
        l_entry = local.get(key)
        r_entry = remote.get(key)
        if l_entry and r_entry:
            # Prefer entry with more fields
            if len(l_entry) > len(r_entry):
                merged[key] = l_entry
            elif len(r_entry) > len(l_entry):
                merged[key] = r_entry
            else:
                # If same length, prefer latest timestamp
                l_ts = parse_timestamp(l_entry)
                r_ts = parse_timestamp(r_entry)
                if l_ts and r_ts:
                    merged[key] = l_entry if l_ts > r_ts else r_entry
                else:
                    merged[key] = l_entry  # fallback
        else:
            merged[key] = l_entry or r_entry
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"Merged {len(merged)} unique applications to {output_path}")

if __name__ == "__main__":
    merge_applications(
        "data/applications.json",
        "data/to_merge/applications.json"
    )
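A minimal sketch of the tie-break above, using two hypothetical entries for the same listing_id (the field names and values here are illustrative, not taken from the real applications.json): with an equal field count, the entry whose timestamp parses later wins.

from datetime import datetime

# Hypothetical entries with the same number of fields (illustrative data).
local_entry = {"status": "sent", "timestamp": "2024-05-01T10:00:00"}
remote_entry = {"status": "sent", "timestamp": "2024-05-02T09:30:00"}

# Same field count, so merge_applications falls through to the timestamp
# comparison; the remote entry wins because its timestamp parses later.
l_ts = datetime.fromisoformat(local_entry["timestamp"])
r_ts = datetime.fromisoformat(remote_entry["timestamp"])
print(local_entry if l_ts > r_ts else remote_entry)  # prints remote_entry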
helper_functions/merge_dict_json.py (new file)

@@ -0,0 +1,50 @@
import json
from pathlib import Path
from datetime import datetime

def parse_timestamp(entry):
    ts = entry.get('fetched_at')
    if ts:
        try:
            return datetime.fromisoformat(ts)
        except Exception:
            return None
    return None

def merge_dict_json(local_path, merge_path, output_path=None, timestamp_field='fetched_at'):
    """
    Merge two dict-based JSON files (keyed by id), deduplicate by key.
    If duplicate, keep entry with latest timestamp_field.
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)
    merged = {}
    all_keys = set(local.keys()) | set(remote.keys())
    for key in all_keys:
        l_entry = local.get(key)
        r_entry = remote.get(key)
        if l_entry and r_entry:
            l_ts = l_entry.get(timestamp_field)
            r_ts = r_entry.get(timestamp_field)
            if l_ts and r_ts:
                merged[key] = l_entry if l_ts > r_ts else r_entry
            else:
                merged[key] = l_entry  # fallback
        else:
            merged[key] = l_entry or r_entry
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)
    print(f"Merged {len(merged)} unique entries to {output_path}")

if __name__ == "__main__":
    merge_dict_json(
        "data/listings.json",
        "data/to_merge/listings.json"
    )
    merge_dict_json(
        "data/wgcompany_listings.json",
        "data/to_merge/wgcompany_listings.json"
    )
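Note that, unlike merge_applications, the duplicate handling here compares the raw fetched_at strings rather than parsed datetimes (parse_timestamp is defined but unused in the merge). For ISO-8601 timestamps of a uniform format this is equivalent, since such strings sort lexicographically in chronological order:

# ISO-8601 timestamps of the same format compare correctly as plain strings,
# so the `l_ts > r_ts` comparison above needs no datetime parsing.
assert "2024-05-02T09:30:00" > "2024-05-01T10:00:00"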
helper_functions/merge_listing_times.py (new file)

@@ -0,0 +1,34 @@
import csv
from pathlib import Path

def merge_listing_times(local_path, merge_path, output_path=None):
    """
    Merge two listing_times.csv files, deduplicate by listing_id and timestamp.
    local_path: main data/listing_times.csv
    merge_path: data/to_merge/listing_times.csv
    output_path: where to write merged file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    seen = set()
    rows = []
    # Read both files
    for path in [local_path, merge_path]:
        with open(path, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                key = (row['listing_id'], row['timestamp'])
                if key not in seen:
                    seen.add(key)
                    rows.append(row)
    # Write merged file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)
    print(f"Merged {len(rows)} unique rows to {output_path}")

if __name__ == "__main__":
    merge_listing_times(
        "data/listing_times.csv",
        "data/to_merge/listing_times.csv"
    )
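The (listing_id, timestamp) dedup above can be exercised on an in-memory CSV; this is a hypothetical sample (column names match what merge_listing_times expects), not real data:

import csv, io

sample = (
    "listing_id,timestamp\n"
    "123,2024-05-01T10:00:00\n"  # kept
    "123,2024-05-01T10:00:00\n"  # exact duplicate key, dropped
    "123,2024-05-02T09:30:00\n"  # same id, new timestamp, kept
)
seen, rows = set(), []
for row in csv.DictReader(io.StringIO(sample)):
    key = (row["listing_id"], row["timestamp"])
    if key not in seen:
        seen.add(key)
        rows.append(row)
print(len(rows))  # 2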
helper_functions/merge_wgcompany_times.py (new file)

@@ -0,0 +1,34 @@
import csv
from pathlib import Path

def merge_wgcompany_times(local_path, merge_path, output_path=None):
    """
    Merge two wgcompany_times.csv files, deduplicate by listing_id and timestamp.
    local_path: main data/wgcompany_times.csv
    merge_path: data/to_merge/wgcompany_times.csv
    output_path: where to write merged file (default: overwrite local_path)
    """
    output_path = output_path or local_path
    seen = set()
    rows = []
    # Read both files
    for path in [local_path, merge_path]:
        with open(path, newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                key = (row['listing_id'], row['timestamp'])
                if key not in seen:
                    seen.add(key)
                    rows.append(row)
    # Write merged file
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys())
        writer.writeheader()
        writer.writerows(rows)
    print(f"Merged {len(rows)} unique rows to {output_path}")

if __name__ == "__main__":
    merge_wgcompany_times(
        "data/wgcompany_times.csv",
        "data/to_merge/wgcompany_times.csv"
    )