import re
import json

# Text sitting between a '>' and the next '<'. The captured text must start
# with an uppercase letter (ASCII or one of the listed accented capitals) and
# end with an ASCII letter/digit or one of the listed lowercase accented
# letters, so text ending in punctuation is not captured by this pattern.
_JSX_TEXT_RE = re.compile(
    r'>\s*([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ][^<]*[a-zA-Z0-9áéíóúàâêôãõç])\s*<'
)

# Attributes whose quoted values are collected. The name is not anchored, so
# any attribute merely ending in one of these (e.g. `aria-label=`) matches too.
_TEXT_ATTRS = ('placeholder', 'title', 'label')

# Strings made up solely of digits and non-word characters.
_NON_TEXT_RE = re.compile(r'^[0-9\W]+$')

# Strings that begin with a brace-wrapped identifier such as `{foo}`.
_BRACE_VAR_RE = re.compile(r'\{[a-zA-Z0-9_]+\}')


def _extract_attr_values(content, attr):
    """Return the set of non-empty quoted values found after ``attr=``.

    The closing quote must be the same character as the opening quote, so a
    double-quoted value may contain an apostrophe (``title="Copo d'água"``)
    and a single-quoted value may contain a double quote.
    """
    pattern = rf'{attr}=(?:"([^"]+)"|\'([^\']+)\')'
    # Exactly one of the two groups is non-empty for every match.
    return {dq or sq for dq, sq in re.findall(pattern, content)}


def _is_translatable(text):
    """Return True unless ``text`` is obviously not human-readable copy."""
    if len(text) < 3:
        return False
    if _NON_TEXT_RE.match(text):
        return False
    if 'http' in text or '.com' in text or '://' in text:
        return False
    # Ignore things that look like React variables: {foo}
    if _BRACE_VAR_RE.match(text):
        return False
    return True


def _extract_translatable(content):
    """Return the sorted, de-duplicated candidate strings found in ``content``.

    Candidates come from two places: text between tags and the values of the
    attributes listed in ``_TEXT_ATTRS``.

    NOTE: string literals inside script logic (e.g. ``status === 'Pago'`` or
    ``confirm('...')``) are not extracted.
    """
    candidates = set(_JSX_TEXT_RE.findall(content))
    for attr in _TEXT_ATTRS:
        candidates |= _extract_attr_values(content, attr)
    return [text for text in sorted(candidates) if _is_translatable(text)]


def main(input_path='index.html', output_path='to_translate.json'):
    """Extract candidate translatable strings from a file and save them as JSON.

    Args:
        input_path: UTF-8 text file to scan. Defaults to ``index.html``.
        output_path: Destination for the JSON array of strings, written as
            UTF-8 with non-ASCII characters left unescaped. Defaults to
            ``to_translate.json``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    translatable = _extract_translatable(content)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(translatable, f, indent=2, ensure_ascii=False)
        
# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()