163 lines
4.9 KiB
Python
163 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Check which German translation files are actually translated vs. just stubs or copies.
|
|
|
|
This script compares English and German versions of project posts to identify:
|
|
1. Missing German files
|
|
2. Identical content (likely untranslated)
|
|
3. Stub files with minimal content
|
|
"""
|
|
|
|
import os
|
|
from pathlib import Path
|
|
import re
|
|
|
|
# Change to the project root (the parent of this script's directory) so the
# relative "content/project" path used below resolves no matter where the
# script is launched from. resolve() makes the path absolute first: when
# __file__ is relative (e.g. "check.py" on interpreters older than 3.9),
# .parent.parent would otherwise collapse to "." and leave the cwd unchanged.
os.chdir(Path(__file__).resolve().parent.parent)
|
|
|
|
def extract_content(filepath):
    """Return the body of a markdown file, without its ``+++`` frontmatter.

    The file is read as UTF-8. When the text contains at least two ``+++``
    delimiters, everything after the second one is returned; otherwise the
    whole text is returned. Surrounding whitespace is stripped either way.

    Args:
        filepath: Path of the markdown file to read.

    Returns:
        The stripped body text.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split on the first two delimiters only: a later "+++" inside the body
    # must stay part of the body instead of cutting it short.
    parts = content.split('+++', 2)
    if len(parts) == 3:
        return parts[2].strip()
    return content.strip()
|
|
|
|
def extract_title_from_frontmatter(filepath):
    """Return the double-quoted ``title`` value from a file's frontmatter.

    The file is read as UTF-8. When the text contains at least two ``+++``
    delimiters, only the text between the first two is searched; otherwise
    the whole text is searched.

    Args:
        filepath: Path of the markdown file to read.

    Returns:
        The title string, or ``None`` when no ``title = "..."`` line is found.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Restrict the search to the frontmatter so a 'title = "..."' snippet in
    # the body is never reported as the page title.
    parts = content.split('+++', 2)
    frontmatter = parts[1] if len(parts) == 3 else content

    # Anchor at the start of a line so keys that merely end in "title"
    # (e.g. subtitle = "...") do not match.
    match = re.search(
        r'^[ \t]*title\s*=\s*"([^"]+)"', frontmatter, re.MULTILINE
    )
    if match:
        return match.group(1)
    return None
|
|
|
|
def similarity_ratio(text1, text2):
    """Score how alike two texts are, from 0.0 to 1.0.

    Runs of whitespace are collapsed to single spaces before comparing.
    The score is the number of positions at which both normalized strings
    hold the same character, divided by the length of the longer string.
    The comparison is purely positional: characters are paired index by
    index, with no alignment of shifted text.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        0.0 if either argument is empty/falsy, 1.0 if the normalized texts
        are equal, otherwise the positional match fraction.
    """
    if not (text1 and text2):
        return 0.0

    left = ' '.join(text1.split())
    right = ' '.join(text2.split())
    if left == right:
        return 1.0

    # The normalized strings differ, so at least one of them is non-empty
    # and the denominator is always positive here.
    denominator = max(len(left), len(right))

    same_position = 0
    for a, b in zip(left, right):
        if a == b:
            same_position += 1
    return same_position / denominator
|
|
|
|
def main():
    """Print a German translation status report for ``content/project``.

    For every ``*/index.md`` under ``content/project`` the sibling
    ``index.de.md`` is put into exactly one category:

    * missing      - the German file does not exist;
    * untranslated - body similarity to the English file is above 0.95;
    * stub         - the German body has fewer than 20 words;
    * translated   - everything else.

    A summary, one section per non-empty problem category, and the total
    number of projects needing translation are written to stdout.
    """
    project_dir = Path("content/project")

    # Scan the directory once so the reported total always matches the
    # set of projects that is categorized below.
    en_files = sorted(project_dir.glob("*/index.md"))

    missing_german = []
    untranslated = []
    stub_files = []
    properly_translated = []

    for en_file in en_files:
        project_folder = en_file.parent
        de_file = project_folder / "index.de.md"
        project_name = project_folder.name

        if not de_file.exists():
            missing_german.append(project_name)
            continue

        en_content = extract_content(en_file)
        de_content = extract_content(de_file)
        en_title = extract_title_from_frontmatter(en_file)
        de_title = extract_title_from_frontmatter(de_file)

        similarity = similarity_ratio(en_content, de_content)
        de_word_count = len(de_content.split())

        # The similarity check comes first: a short file copied verbatim
        # from English is reported as untranslated, not as a stub.
        if similarity > 0.95:
            untranslated.append({
                'name': project_name,
                'similarity': similarity,
                'en_title': en_title,
                'de_title': de_title,
            })
        elif de_word_count < 20:
            stub_files.append({
                'name': project_name,
                'word_count': de_word_count,
                'en_title': en_title,
                'de_title': de_title,
            })
        else:
            properly_translated.append({
                'name': project_name,
                'similarity': similarity,
                'word_count': de_word_count,
            })

    rule = "=" * 80

    print(rule)
    print("GERMAN TRANSLATION STATUS REPORT")
    print(rule)
    print()

    print("📊 SUMMARY")
    print(f" Total projects: {len(en_files)}")
    print(f" ✅ Properly translated: {len(properly_translated)}")
    print(f" ❌ Missing German file: {len(missing_german)}")
    print(f" ⚠️ Untranslated (identical content): {len(untranslated)}")
    print(f" ⚠️ Stub files (< 20 words): {len(stub_files)}")
    print()

    if missing_german:
        print(rule)
        print("❌ MISSING GERMAN FILES")
        print(rule)
        for project in missing_german:
            print(f" • {project}")
        print()

    if untranslated:
        print(rule)
        print("⚠️ UNTRANSLATED (Identical or near-identical to English)")
        print(rule)
        for item in untranslated:
            print(f" • {item['name']}")
            print(f" Similarity: {item['similarity']:.1%}")
            print(f" EN title: {item['en_title']}")
            print(f" DE title: {item['de_title']}")
            print()  # blank line after each entry

    if stub_files:
        print(rule)
        print("⚠️ STUB FILES (Less than 20 words)")
        print(rule)
        for item in stub_files:
            print(f" • {item['name']}")
            print(f" Word count: {item['word_count']}")
            print(f" EN title: {item['en_title']}")
            print(f" DE title: {item['de_title']}")
            print()  # blank line after each entry

    needing_translation = (
        len(missing_german) + len(untranslated) + len(stub_files)
    )
    print(rule)
    print(f"Total needing translation: {needing_translation}")
    print(rule)
|
|
|
|
# Produce the report only when this file is executed as a script.
if __name__ == "__main__":
    main()
|