"""Collect candidate UI strings from an HTML/JSX file and dump them as JSON."""
import json
import re

# Text sitting between tags (">Text<"). It must start with an uppercase letter
# (ASCII or one of the listed accented capitals) and end with a letter or
# digit, which skips markup, whitespace-only runs and text ending in
# punctuation or "}".
_JSX_TEXT_RE = re.compile(
    r'>\s*([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ][^<]*[a-zA-Z0-9áéíóúàâêôãõç])\s*<'
)

# Attributes whose quoted values are treated as user-facing text.
_TEXT_ATTRIBUTES = ('placeholder', 'title', 'label')

# One pattern per attribute. Each quote style has its own alternative so a
# value ends at the same kind of quote that opened it; this keeps values such
# as "Don't panic" intact instead of cutting them at the apostrophe.
_ATTR_VALUE_RES = tuple(
    re.compile(rf'{attr}=(?:"([^"]+)"|\'([^\']+)\')')
    for attr in _TEXT_ATTRIBUTES
)

# Strings made only of digits and non-word characters (numbers, punctuation).
_NO_WORDS_RE = re.compile(r'^[0-9\W]+$')

# Strings that begin with a "{name}" placeholder.
_LEADING_PLACEHOLDER_RE = re.compile(r'\{[a-zA-Z0-9_]+\}')


def _is_translatable(text: str) -> bool:
    """Return False for strings that are obviously not translatable text."""
    if len(text) < 3:
        return False
    if _NO_WORDS_RE.match(text):
        return False
    # Anything that looks like a URL or domain name.
    if 'http' in text or '.com' in text or '://' in text:
        return False
    # re.match anchors at the start only, so any string *starting* with a
    # {name} placeholder is rejected.
    if _LEADING_PLACEHOLDER_RE.match(text):
        return False
    return True


def _extract_translatable(content: str) -> list:
    """Return the sorted, de-duplicated translatable strings found in content.

    Only text between tags and the values of the attributes listed in
    ``_TEXT_ATTRIBUTES`` are considered. String literals inside JS
    expressions (e.g. ``confirm('...')`` or ``status === '...'``) are not
    extracted.
    """
    candidates = set(_JSX_TEXT_RE.findall(content))
    for pattern in _ATTR_VALUE_RES:
        # Exactly one of the two groups is non-empty for every match, because
        # each alternative requires at least one character.
        for double_quoted, single_quoted in pattern.findall(content):
            candidates.add(double_quoted or single_quoted)
    return [text for text in sorted(candidates) if _is_translatable(text)]


def main(input_path='index.html', output_path='to_translate.json'):
    """Extract translatable strings from input_path and write them as JSON.

    Args:
        input_path: UTF-8 encoded HTML/JSX file to scan.
        output_path: Destination file; receives a JSON array of strings,
            indented by two spaces and with non-ASCII characters kept as-is.

    Raises:
        OSError: If input_path cannot be read or output_path cannot be
            written.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    translatable = _extract_translatable(content)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(translatable, f, indent=2, ensure_ascii=False)


if __name__ == '__main__':
    main()