#!/usr/bin/env python3
"""
Check which German translation files are actually translated vs. just stubs or copies.

This script compares English and German versions of project posts to identify:
1. Missing German files
2. Identical content (likely untranslated)
3. Stub files with minimal content
"""

import os
import re
from pathlib import Path

# Change to the project root (the script is assumed to live one level below it).
os.chdir(Path(__file__).parent.parent)


def extract_content(filepath):
    """Extract the main content (excluding frontmatter) from a markdown file."""
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split the TOML frontmatter (delimited by +++) from the body.
    # maxsplit=2 ensures a stray '+++' inside the body does not truncate it.
    parts = content.split('+++', 2)
    if len(parts) >= 3:
        # Return everything after the closing +++.
        return parts[2].strip()
    return content.strip()


def extract_title_from_frontmatter(filepath):
    """Extract the title from the frontmatter."""
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    match = re.search(r'title\s*=\s*"([^"]+)"', content)
    if match:
        return match.group(1)
    return None


def similarity_ratio(text1, text2):
    """Calculate a simple, position-based similarity ratio between two texts."""
    if not text1 or not text2:
        return 0.0

    # Normalize whitespace
    text1_norm = ' '.join(text1.split())
    text2_norm = ' '.join(text2.split())

    if text1_norm == text2_norm:
        return 1.0

    # Simple character-based similarity
    longer = max(len(text1_norm), len(text2_norm))
    if longer == 0:
        return 1.0

    # Count characters that match at the same position.
    matches = sum(c1 == c2 for c1, c2 in zip(text1_norm, text2_norm))
    return matches / longer


def main():
    project_dir = Path("content/project")

    missing_german = []
    untranslated = []
    stub_files = []
    properly_translated = []

    # Find all English index.md files
    for en_file in sorted(project_dir.glob("*/index.md")):
        project_folder = en_file.parent
        de_file = project_folder / "index.de.md"
        project_name = project_folder.name

        # Check if the German file exists
        if not de_file.exists():
            missing_german.append(project_name)
            continue

        # Extract content and titles
        en_content = extract_content(en_file)
        de_content = extract_content(de_file)
        en_title = extract_title_from_frontmatter(en_file)
        de_title = extract_title_from_frontmatter(de_file)

        # Check whether the content is identical or very similar
        similarity = similarity_ratio(en_content, de_content)

        # Check whether the German file is a stub (very short content)
        de_word_count = len(de_content.split())

        if similarity > 0.95:
            untranslated.append({
                'name': project_name,
                'similarity': similarity,
                'en_title': en_title,
                'de_title': de_title
            })
        elif de_word_count < 20:
            stub_files.append({
                'name': project_name,
                'word_count': de_word_count,
                'en_title': en_title,
                'de_title': de_title
            })
        else:
            properly_translated.append({
                'name': project_name,
                'similarity': similarity,
                'word_count': de_word_count
            })

    # Print results
    print("=" * 80)
    print("GERMAN TRANSLATION STATUS REPORT")
    print("=" * 80)
    print()
    print("📊 SUMMARY")
    print(f"   Total projects: {len(list(project_dir.glob('*/index.md')))}")
    print(f"   ✅ Properly translated: {len(properly_translated)}")
    print(f"   ❌ Missing German file: {len(missing_german)}")
    print(f"   ⚠️  Untranslated (identical content): {len(untranslated)}")
    print(f"   ⚠️  Stub files (< 20 words): {len(stub_files)}")
    print()

    if missing_german:
        print("=" * 80)
        print("❌ MISSING GERMAN FILES")
        print("=" * 80)
        for project in missing_german:
            print(f"  • {project}")
        print()

    if untranslated:
        print("=" * 80)
        print("⚠️  UNTRANSLATED (Identical or near-identical to English)")
        print("=" * 80)
        for item in untranslated:
            print(f"  • {item['name']}")
            print(f"    Similarity: {item['similarity']:.1%}")
            print(f"    EN title: {item['en_title']}")
            print(f"    DE title: {item['de_title']}")
        print()

    if stub_files:
        print("=" * 80)
        print("⚠️  STUB FILES (Less than 20 words)")
        print("=" * 80)
        for item in stub_files:
            print(f"  • {item['name']}")
            print(f"    Word count: {item['word_count']}")
            print(f"    EN title: {item['en_title']}")
            print(f"    DE title: {item['de_title']}")
        print()

    print("=" * 80)
    print(f"Total needing translation: {len(missing_german) + len(untranslated) + len(stub_files)}")
    print("=" * 80)


if __name__ == "__main__":
    main()