more translation, add unity for defences

2025-10-13 17:20:06 +02:00 · 2025-10-13 17:20:06 +02:00 · 205c953752
commit 205c953752
parent a41be821c1
768 changed files with 75229 additions and 21035 deletions
--- a/scripts/check_translations.py
+++ b/scripts/check_translations.py
@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""
+Check which German translation files are actually translated vs. just stubs or copies.
+
+This script compares English and German versions of project posts to identify:
+1. Missing German files
+2. Identical content (likely untranslated)
+3. Stub files with minimal content
+"""
+
+import difflib
+import os
+import re
+from pathlib import Path
+
+# Change to the project root (the parent of this script's directory) so the
+# relative "content/project" path used in main() resolves no matter where the
+# script is launched from. resolve() makes the path absolute first: before
+# Python 3.9 the main script's __file__ could be a bare relative filename,
+# whose .parent.parent collapses to "." and would leave the cwd unchanged.
+os.chdir(Path(__file__).resolve().parent.parent)
+
+def extract_content(filepath):
+    """Return the body of a markdown file, without its ``+++`` frontmatter.
+
+    Args:
+        filepath: Path of the markdown file to read (decoded as UTF-8).
+
+    Returns:
+        The text following the second ``+++`` delimiter, stripped of
+        surrounding whitespace. If the file contains fewer than two
+        delimiters, the whole file content is returned stripped.
+    """
+    with open(filepath, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    # Split at most twice: only the first two ``+++`` markers delimit the
+    # frontmatter. An unbounded split would cut the body short at any later
+    # ``+++`` that appears in the post itself.
+    parts = content.split('+++', 2)
+    if len(parts) == 3:
+        return parts[2].strip()
+    return content.strip()
+
+def extract_title_from_frontmatter(filepath):
+    """Return the ``title`` value from a markdown file's frontmatter.
+
+    Args:
+        filepath: Path of the markdown file to read (decoded as UTF-8).
+
+    Returns:
+        The text between the double quotes of the first ``title = "..."``
+        line, or ``None`` if there is no such line.
+    """
+    with open(filepath, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    # Look only inside the ``+++``-delimited frontmatter when there is one, so
+    # a ``title = "..."`` line in the post body is never picked up. Files
+    # without both delimiters are searched as a whole.
+    parts = content.split('+++', 2)
+    frontmatter = parts[1] if len(parts) == 3 else content
+
+    # Anchor the key to the start of a line: an unanchored pattern would also
+    # match the tail of keys such as ``subtitle``.
+    match = re.search(r'^[ \t]*title\s*=\s*"([^"]+)"', frontmatter, re.MULTILINE)
+    if match:
+        return match.group(1)
+    return None
+
+def similarity_ratio(text1, text2):
+    """Return how similar two texts are, as a float between 0.0 and 1.0.
+
+    The texts are compared word by word with ``difflib.SequenceMatcher``, so
+    runs of words shared by both texts count as matching even when they sit
+    at different offsets. A position-by-position character comparison would
+    instead score a copy with a few inserted words as almost entirely
+    different.
+
+    Args:
+        text1: First text.
+        text2: Second text.
+
+    Returns:
+        0.0 if either text is empty or ``None``; 1.0 if the texts are equal
+        once whitespace is normalized; otherwise ``2 * M / T``, where ``M``
+        is the number of matched words and ``T`` is the total number of
+        words in both texts.
+    """
+    if not text1 or not text2:
+        return 0.0
+
+    # Splitting on whitespace both tokenizes the text and normalizes any
+    # differences in spacing or line breaks.
+    words1 = text1.split()
+    words2 = text2.split()
+
+    if words1 == words2:
+        return 1.0
+
+    # autojunk=False: the automatic "popular element" heuristic would ignore
+    # frequent words in longer texts and skew the score.
+    matcher = difflib.SequenceMatcher(None, words1, words2, autojunk=False)
+    return matcher.ratio()
+
+def _print_heading(title):
+    """Print a section title framed above and below by an 80-character rule."""
+    rule = "=" * 80
+    print(rule)
+    print(title)
+    print(rule)
+
+def main():
+    """Classify every project's German translation and print a status report.
+
+    Each ``content/project/*/index.md`` is paired with the ``index.de.md``
+    beside it and sorted into exactly one category:
+
+    * missing - there is no ``index.de.md``;
+    * untranslated - the German body's similarity to the English body is
+      above 0.95;
+    * stub - the German body has fewer than 20 words;
+    * properly translated - everything else.
+
+    The report is written to standard output; nothing is returned.
+
+    Raises:
+        SystemExit: If the ``content/project`` directory does not exist.
+    """
+    # Named once here so the checks and the report texts cannot drift apart.
+    similarity_threshold = 0.95
+    stub_word_limit = 20
+
+    project_dir = Path("content/project")
+    if not project_dir.is_dir():
+        # Without this guard a missing directory yields a report claiming
+        # zero projects, which reads like "nothing left to translate".
+        raise SystemExit(f"Project directory not found: {project_dir.resolve()}")
+
+    missing_german = []
+    untranslated = []
+    stub_files = []
+    properly_translated = []
+
+    # Glob once; the list is reused below for the summary's project total.
+    en_files = sorted(project_dir.glob("*/index.md"))
+    for en_file in en_files:
+        project_folder = en_file.parent
+        project_name = project_folder.name
+        de_file = project_folder / "index.de.md"
+
+        if not de_file.exists():
+            missing_german.append(project_name)
+            continue
+
+        en_content = extract_content(en_file)
+        de_content = extract_content(de_file)
+        en_title = extract_title_from_frontmatter(en_file)
+        de_title = extract_title_from_frontmatter(de_file)
+
+        similarity = similarity_ratio(en_content, de_content)
+        de_word_count = len(de_content.split())
+
+        # The similarity test comes first, so a short file copied verbatim
+        # from English is reported as untranslated rather than as a stub.
+        if similarity > similarity_threshold:
+            untranslated.append({
+                'name': project_name,
+                'similarity': similarity,
+                'en_title': en_title,
+                'de_title': de_title,
+            })
+        elif de_word_count < stub_word_limit:
+            stub_files.append({
+                'name': project_name,
+                'word_count': de_word_count,
+                'en_title': en_title,
+                'de_title': de_title,
+            })
+        else:
+            properly_translated.append({
+                'name': project_name,
+                'similarity': similarity,
+                'word_count': de_word_count,
+            })
+
+    _print_heading("GERMAN TRANSLATION STATUS REPORT")
+    print()
+
+    print("📊 SUMMARY")
+    print(f"  Total projects: {len(en_files)}")
+    print(f"  ✅ Properly translated: {len(properly_translated)}")
+    print(f"  ❌ Missing German file: {len(missing_german)}")
+    print(f"  ⚠️  Untranslated (identical content): {len(untranslated)}")
+    print(f"  ⚠️  Stub files (< {stub_word_limit} words): {len(stub_files)}")
+    print()
+
+    if missing_german:
+        _print_heading("❌ MISSING GERMAN FILES")
+        for project in missing_german:
+            print(f"  • {project}")
+        print()
+
+    if untranslated:
+        _print_heading("⚠️  UNTRANSLATED (Identical or near-identical to English)")
+        for item in untranslated:
+            print(f"  • {item['name']}")
+            print(f"    Similarity: {item['similarity']:.1%}")
+            print(f"    EN title: {item['en_title']}")
+            print(f"    DE title: {item['de_title']}")
+            print()
+
+    if stub_files:
+        _print_heading(f"⚠️  STUB FILES (Less than {stub_word_limit} words)")
+        for item in stub_files:
+            print(f"  • {item['name']}")
+            print(f"    Word count: {item['word_count']}")
+            print(f"    EN title: {item['en_title']}")
+            print(f"    DE title: {item['de_title']}")
+            print()
+
+    needing_translation = len(missing_german) + len(untranslated) + len(stub_files)
+    _print_heading(f"Total needing translation: {needing_translation}")
+
+# Print the report only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()