LearnIT/lib/core/services/materials_rag_service.dart

import 'package:cloud_firestore/cloud_firestore.dart';
import 'package:firebase_auth/firebase_auth.dart';
import 'package:firebase_storage/firebase_storage.dart';
import 'package:syncfusion_flutter_pdf/pdf.dart';
import '../utils/logger.dart';

/// Service that retrieves RAG context from teacher-uploaded PDF materials.
///
/// Extracts the text of the PDFs and selects the passages relevant to a
/// question using simple keyword matching.
class MaterialsRAGService {
  // Firebase handles shared by every static method of this service.
  static final FirebaseFirestore _firestore = FirebaseFirestore.instance;
  // Storage instance for the explicitly named bucket below.
  static final FirebaseStorage _storage = FirebaseStorage.instanceFor(
    bucket: 'teachit-app.firebasestorage.app',
  );
  static final FirebaseAuth _auth = FirebaseAuth.instance;

  /// In-memory cache of text extracted from PDFs, keyed by a versioned string
  /// built by the methods that use it.
  ///
  /// The value type is a list of strings; the writers in this file store a
  /// single-element list holding the whole processed text of one PDF.
  static final Map<String, List<String>> _chunksCache = {};

  /// Maximum number of context windows sent to the model.
  static const int _maxRelevantChunks = 5;

  /// Lists the materials available to the currently authenticated student.
  ///
  /// A material is returned when its `classId` is one of the classes the
  /// student is enrolled in, or when it has no `classId` at all (kept for
  /// compatibility with older uploads). Every entry carries `id` and `name`,
  /// plus `classId`, `teacherId` and `url` when present on the document.
  ///
  /// Returns an empty list when nobody is signed in, when the student has no
  /// enrollments or teachers, or when any lookup fails (the error is logged).
  static Future<List<Map<String, String>>>
  getAvailableMaterialsForStudent() async {
    try {
      final user = _auth.currentUser;
      if (user == null) return [];

      final uid = user.uid;

      // 1. Class IDs taken from the student's enrollments.
      final enrollmentSnapshot = await _firestore
          .collection('enrollments')
          .where('studentId', isEqualTo: uid)
          .get();

      final enrolledClassIds = enrollmentSnapshot.docs
          .map((doc) => doc.data()['classId'] as String?)
          .whereType<String>()
          .toSet();

      if (enrolledClassIds.isEmpty) return [];

      // 2. Teachers of those classes.
      final teacherIds = await _getTeacherIdsForStudent(uid);
      if (teacherIds.isEmpty) return [];

      // 3. Materials of those teachers. `whereIn` is queried in batches of 10
      //    so that no teacher is dropped when the student has more than ten.
      final snapshots = await Future.wait([
        for (var i = 0; i < teacherIds.length; i += 10)
          _firestore
              .collection('materials')
              .where('teacherId', whereIn: teacherIds.skip(i).take(10).toList())
              .orderBy('createdAt', descending: true)
              .get(),
      ]);
      final docs = [for (final snapshot in snapshots) ...snapshot.docs];

      // A single batch is already ordered by the query. Several batches must
      // be merged back into "newest first" order on the client.
      if (snapshots.length > 1) {
        docs.sort((a, b) {
          final Object? left = a.data()['createdAt'];
          final Object? right = b.data()['createdAt'];
          if (left is Comparable &&
              right is Comparable &&
              left.runtimeType == right.runtimeType) {
            return right.compareTo(left);
          }
          return 0; // Values that cannot be compared keep their order.
        });
      }

      // 4. Keep materials of the student's classes and materials without a
      //    classId (older uploads).
      final result = <Map<String, String>>[];
      for (final doc in docs) {
        final data = doc.data();
        final classId = data['classId'] as String?;
        if (classId != null && !enrolledClassIds.contains(classId)) continue;

        final fileName = data['fileName'] as String? ?? 'Material';
        final teacherId = data['teacherId'] as String?;
        final url = data['url'] as String?;
        result.add({
          'id': doc.id,
          'name': fileName,
          if (classId != null) 'classId': classId,
          if (teacherId != null) 'teacherId': teacherId,
          if (url != null) 'url': url,
        });
      }

      Logger.info('Available materials for student: ${result.length}');
      return result;
    } catch (e) {
      Logger.error('Error getting available materials for student: $e');
      return [];
    }
  }

  /// Builds the RAG context for [userQuery] from the selected PDF materials.
  ///
  /// Only the materials whose document IDs are listed in
  /// [selectedMaterialIds] are processed; when that list is null or empty no
  /// PDF is processed and an empty string is returned. For each PDF the
  /// extracted text is sent whole when it is short (up to 10000 characters);
  /// otherwise up to [maxChunks] keyword windows around matches of
  /// [userQuery] are sent.
  ///
  /// When [filterTableData] is true, lines that look like tabular data are
  /// removed from the context.
  ///
  /// [maxMaterials] is accepted for backward compatibility and is currently
  /// not used.
  ///
  /// Returns the assembled context, or an empty string when there is no
  /// signed-in user, nothing usable was extracted, or an error occurred (the
  /// error is logged).
  static Future<String> getRelevantChunks({
    required String userQuery,
    int maxMaterials = 5,
    int maxChunks = _maxRelevantChunks,
    List<String>? selectedMaterialIds,
    bool filterTableData = false,
  }) async {
    // PDFs with at most this many characters of text are sent in full.
    const smallPdfCharLimit = 10000;

    try {
      if (_auth.currentUser == null) {
        Logger.warning('No authenticated user for materials context');
        return '';
      }

      // Without a selection nothing is processed automatically: the user has
      // to pick a material before asking about its content.
      if (selectedMaterialIds == null || selectedMaterialIds.isEmpty) {
        Logger.info(
          'No selectedMaterialIds — skipping automatic PDF processing',
        );
        return '';
      }

      Logger.info('Fetching selected materials: $selectedMaterialIds');

      // Drop duplicates (they would repeat the same material in the context)
      // and empty IDs (not valid document IDs for a documentId query).
      final materialIds = selectedMaterialIds
          .where((id) => id.isNotEmpty)
          .toSet()
          .toList();

      // `whereIn` is queried in batches of 10 IDs.
      final batches = <Future<QuerySnapshot>>[];
      for (var i = 0; i < materialIds.length; i += 10) {
        batches.add(
          _firestore
              .collection('materials')
              .where(
                FieldPath.documentId,
                whereIn: materialIds.skip(i).take(10).toList(),
              )
              .get(),
        );
      }
      final results = await Future.wait(batches);
      final allDocs = results.expand((s) => s.docs).toList();
      Logger.info('Selected materials found: ${allDocs.length}');

      final contextBuffer = StringBuffer()
        ..writeln('Contexto dos materiais do professor:');
      var hasContent = false;

      for (final doc in allDocs) {
        final data = doc.data() as Map<String, dynamic>;
        final fileName = data['fileName'];
        if (fileName is! String) continue;
        if (!fileName.toLowerCase().endsWith('.pdf')) continue;
        final teacherId = data['teacherId'];
        if (teacherId is! String) continue;

        // The key includes the teacher so that equally named files uploaded
        // by different teachers do not share an entry. The `_v6` suffix
        // invalidates entries written with an older clean-up pipeline.
        final cacheKey = '$teacherId/${fileName}_v6';
        final cached = _chunksCache[cacheKey];
        String fullText;
        if (cached != null && cached.isNotEmpty) {
          fullText = cached.first;
          Logger.info(
            'Using cached text for $fileName: ${fullText.length} chars',
          );
        } else {
          try {
            final rawText = await _extractFullText(fileName, teacherId);
            if (rawText.isEmpty) continue;
            fullText = _normalizeExtractedText(rawText);
            _chunksCache[cacheKey] = [fullText];
            Logger.info(
              'PDF "$fileName" -> ${fullText.length} chars extracted',
            );
          } catch (e) {
            Logger.error('Error extracting text from $fileName: $e');
            continue;
          }
        }

        // Small PDFs (forms, notes, ...) are sent whole. Large ones are
        // reduced to keyword windows so the model is not overloaded.
        String context;
        if (fullText.length <= smallPdfCharLimit) {
          context = fullText;
          Logger.info(
            'Small PDF — sending full text (${fullText.length} chars)',
          );
        } else {
          final windows = _extractKeywordWindows(
            fullText,
            userQuery,
            maxChunks,
          );
          context = windows.join('\n\n---\n\n');
          Logger.info('Large PDF — keyword windows: ${windows.length}');
        }

        if (filterTableData) {
          context = _filterTableData(context);
          Logger.info('Filtered table data from content');
        }

        if (context.isNotEmpty) {
          contextBuffer
            ..writeln('\n[MATERIAL: $fileName]')
            ..writeln(context);
          hasContent = true;
        }
      }

      return hasContent ? contextBuffer.toString() : '';
    } catch (e) {
      Logger.error('Error in RAG chunk retrieval: $e');
      return '';
    }
  }

  /// Cleans up raw text extracted from a PDF before it is cached.
  ///
  /// Collapses runs of spaces/tabs and of newlines, rewrites the
  /// accent-plus-letter sequences listed below into the accented Portuguese
  /// letter, and re-inserts spaces where a lowercase letter or digit is
  /// directly followed by an uppercase letter, or a digit by a letter.
  static String _normalizeExtractedText(String rawText) {
    // Decorative layouts produce long runs of whitespace and blank lines.
    var cleaned = rawText
        .replaceAll(RegExp(r'[ \t]+'), ' ')
        .replaceAll(RegExp(r'\n{2,}'), '\n')
        .trim();

    // Accents extracted as separate characters (fonts without a Unicode
    // mapping). Longer sequences are replaced before their shorter prefixes.
    cleaned = cleaned
        .replaceAll('¸c˜ao', 'ção')
        .replaceAll('˜ao', 'ão')
        .replaceAll('¸c˜oes', 'ções')
        .replaceAll('˜oes', 'ões')
        .replaceAll('¸c', 'ç')
        .replaceAll('´a', 'á')
        .replaceAll('´e', 'é')
        .replaceAll('´i', 'í')
        .replaceAll('´o', 'ó')
        .replaceAll('´u', 'ú')
        .replaceAll('ˆa', 'â')
        .replaceAll('ˆe', 'ê')
        .replaceAll('ˆo', 'ô')
        .replaceAll('`a', 'à');

    // Glued words: space before an uppercase letter that follows a lowercase
    // letter or a digit.
    cleaned = cleaned.replaceAllMapped(
      RegExp(r'([a-záéíóúàâêôãõç\d])([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ])'),
      (m) => '${m.group(1)} ${m.group(2)}',
    );

    // Space between a digit and the letter that follows it.
    return cleaned.replaceAllMapped(
      RegExp(r'(\d)([A-Za-záéíóúàâêôãõç])'),
      (m) => '${m.group(1)} ${m.group(2)}',
    );
  }

  /// Legacy entry point kept for backward compatibility.
  ///
  /// Forwards to [getRelevantChunks] with an empty query, the given
  /// [maxMaterials] and a fixed `maxChunks` of 3.
  @Deprecated('Use getRelevantChunks with userQuery instead')
  static Future<String> getMaterialsContext({int maxMaterials = 5}) async =>
      getRelevantChunks(
        userQuery: '',
        maxMaterials: maxMaterials,
        maxChunks: 3,
      );

  /// Returns the IDs of the teachers of every class [studentId] is enrolled in.
  ///
  /// Reads the student's `enrollments`, collects their `classId` values, loads
  /// the matching `classes` documents and gathers their `teacherId` fields.
  /// The result contains each teacher once. An empty list is returned when
  /// the student has no enrollments or classes, or when a lookup fails (the
  /// error is logged).
  static Future<List<String>> _getTeacherIdsForStudent(String studentId) async {
    try {
      // 1. Enrollments of the student.
      final enrollmentSnapshot = await _firestore
          .collection('enrollments')
          .where('studentId', isEqualTo: studentId)
          .get();

      if (enrollmentSnapshot.docs.isEmpty) {
        Logger.info('No enrollments found for student $studentId');
        return [];
      }

      // 2. Distinct, non-empty class IDs. Duplicates would waste `whereIn`
      //    slots and an empty string is not a valid document ID to query.
      final classIds = enrollmentSnapshot.docs
          .map((doc) => doc.data()['classId'])
          .whereType<String>()
          .where((id) => id.isNotEmpty)
          .toSet()
          .toList();

      if (classIds.isEmpty) {
        Logger.info('No class IDs found in enrollments');
        return [];
      }

      Logger.info('Found ${classIds.length} classes for student');

      // 3. Load the classes in batches of 10 IDs per `whereIn` query. The
      //    batches are independent, so they are fetched concurrently.
      final classSnapshots = await Future.wait([
        for (var i = 0; i < classIds.length; i += 10)
          _firestore
              .collection('classes')
              .where(
                FieldPath.documentId,
                whereIn: classIds.skip(i).take(10).toList(),
              )
              .get(),
      ]);

      final teacherIds = <String>{};
      for (final snapshot in classSnapshots) {
        for (final doc in snapshot.docs) {
          final teacherId = doc.data()['teacherId'];
          if (teacherId is String && teacherId.isNotEmpty) {
            teacherIds.add(teacherId);
          }
        }
      }

      Logger.info('Found ${teacherIds.length} unique teachers');
      return teacherIds.toList();
    } catch (e) {
      Logger.error('Error getting teacher IDs for student: $e');
      return [];
    }
  }

  /// Maximum number of bytes accepted when downloading a PDF from Firebase
  /// Storage (10 MB); passed as the size limit to `getData`.
  static const int _maxPdfBytes = 10 * 1024 * 1024;

  /// Maximum number of characters of extracted text kept for one PDF.
  static const int _maxExtractedChars = 50000;

  /// Downloads a teacher's PDF from Firebase Storage and extracts its text.
  ///
  /// The file is read from `teachers/<teacherId>/materials/<fileName>`. Page
  /// text is extracted page by page, followed by the values of any form
  /// fields, and the result is capped at [_maxExtractedChars] characters.
  ///
  /// Pages are skipped when they have fewer than 80 characters of text or
  /// match one of the "structure page" heuristics below. When the document
  /// has more than four pages the first two are skipped as well.
  ///
  /// Returns an empty string when nothing could be downloaded or when any
  /// step fails (the error is logged). A page that fails to extract is logged
  /// and skipped without aborting the rest of the document.
  static Future<String> _extractFullText(
    String fileName,
    String teacherId,
  ) async {
    PdfDocument? document;
    try {
      final ref = _storage
          .ref()
          .child('teachers')
          .child(teacherId)
          .child('materials')
          .child(fileName);

      Logger.info('PDF available for extraction: $fileName');

      // getData loads the whole object into memory. Per the Firebase Storage
      // API the download fails when the object is larger than the given
      // limit; that failure lands in the catch below and yields ''.
      final data = await ref.getData(_maxPdfBytes);
      if (data == null || data.isEmpty) {
        Logger.warning('No data received for $fileName');
        return '';
      }

      Logger.info('Downloaded ${data.length} bytes for $fileName');

      document = PdfDocument(inputBytes: data);
      final buffer = StringBuffer();

      // 1. Page text.
      final extractor = PdfTextExtractor(document);
      final totalPages = document.pages.count;
      final startPage = totalPages > 4 ? 2 : 0;
      for (var i = startPage; i < totalPages; i++) {
        if (buffer.length >= _maxExtractedChars) break;
        try {
          final pageText = extractor
              .extractText(startPageIndex: i, endPageIndex: i)
              .trim();
          if (pageText.length < 80) continue;
          final lowerText = pageText.toLowerCase();
          final pipeCount = '|'.allMatches(pageText).length;
          final isStructurePage =
              pipeCount > 3 ||
              (lowerText.contains('table of contents') &&
                  pageText.length < 800) ||
              (lowerText.contains('copyright') && pageText.length < 400) ||
              (lowerText.contains('color insert') && pageText.length < 400) ||
              lowerText.contains('just light novels') ||
              lowerText.contains('download all your fav') ||
              (lowerText.contains('www.') && pageText.length < 300);
          if (isStructurePage) continue;
          buffer.writeln(pageText);
        } catch (e) {
          // Best effort: one unreadable page must not discard the others,
          // but the failure should be visible in the logs.
          Logger.warning('Skipping page ${i + 1} of $fileName: $e');
        }
      }

      // 2. Form field values, if the document has a form.
      final form = document.form;
      final fieldCount = form.fields.count;
      if (fieldCount > 0) {
        buffer.writeln('\n[CAMPOS DO FORMULÁRIO]');
        for (var i = 0; i < fieldCount; i++) {
          if (buffer.length >= _maxExtractedChars) break;
          final field = form.fields[i];
          final name = field.name;
          var value = '';
          if (field is PdfTextBoxField) {
            value = field.text;
          } else if (field is PdfComboBoxField) {
            value = field.selectedValue;
          } else if (field is PdfListBoxField) {
            value = field.selectedValues.join(', ');
          } else if (field is PdfRadioButtonListField) {
            value = field.selectedValue;
          } else if (field is PdfCheckBoxField) {
            value = field.isChecked ? 'Sim' : 'Não';
          }
          final hasName = name != null && name.isNotEmpty;
          if (hasName || value.isNotEmpty) {
            // Unnamed fields get a positional label instead of the literal
            // text "null".
            final label = hasName ? name : 'Campo ${i + 1}';
            buffer.writeln('$label: $value');
          }
        }
      }

      final fullText = buffer.toString();

      // Cap at the limit (the last page or field may have overshot it).
      final result = fullText.length > _maxExtractedChars
          ? fullText.substring(0, _maxExtractedChars)
          : fullText;

      Logger.info(
        'Extracted ${result.length} chars from $fileName ($totalPages pages, $fieldCount form fields)',
      );
      Logger.info(
        'Text preview: ${result.length > 200 ? result.substring(0, 200) : result}',
      );
      return result.trim();
    } catch (e) {
      Logger.error('Error extracting text from $fileName: $e');
      return '';
    } finally {
      document?.dispose();
    }
  }

  /// Extracts up to [maxWindows] context windows of about [windowSize]
  /// characters around the places where keywords of [userQuery] occur in
  /// [text].
  ///
  /// Keywords are the query words longer than three characters plus
  /// capitalised words of at least three characters (treated as proper
  /// nouns). Matching is case-insensitive and works on the original string,
  /// so no intermediate chunk list is built.
  ///
  /// Windows are returned in document order and never overlap: a match that
  /// already lies inside the previous window is skipped, and a window that
  /// would overlap the previous one starts where that one ended.
  ///
  /// When the query yields no keywords the first [windowSize] characters are
  /// returned; when no keyword occurs in [text] a single slice from near the
  /// start of the text is returned. A non-positive [maxWindows] yields an
  /// empty list.
  static List<String> _extractKeywordWindows(
    String text,
    String userQuery,
    int maxWindows, {
    int windowSize = 1200,
  }) {
    if (maxWindows <= 0) return [];

    // `\w` and `\b` only know ASCII letters, which would cut accented words
    // ("função", "José") into pieces. Unicode property classes keep them
    // whole.
    final properNouns = RegExp(
      r'(?<![\p{L}\p{N}_])\p{Lu}\p{Ll}{2,}(?![\p{L}\p{N}_])',
      unicode: true,
    ).allMatches(userQuery).map((m) => m.group(0)!.toLowerCase()).toSet();
    final generalKeywords = userQuery
        .toLowerCase()
        .split(RegExp(r'[^\p{L}\p{N}_]+', unicode: true))
        .where((w) => w.length > 3)
        .toSet();
    final keywords = {...properNouns, ...generalKeywords};

    if (keywords.isEmpty) {
      return [text.length > windowSize ? text.substring(0, windowSize) : text];
    }

    // Every position at which some keyword occurs.
    final textLower = text.toLowerCase();
    final positions = <int>{};
    for (final keyword in keywords) {
      var idx = textLower.indexOf(keyword);
      while (idx != -1) {
        positions.add(idx);
        idx = textLower.indexOf(keyword, idx + 1);
      }
    }

    if (positions.isEmpty) {
      // No match: return a slice from the start of the content, skipping the
      // first 5% of the text (at most 2000 characters) as likely cover/index.
      final skip = (text.length * 0.05).toInt().clamp(0, 2000);
      final end = (skip + windowSize * maxWindows).clamp(skip, text.length);
      return [text.substring(skip, end).trim()];
    }

    final half = windowSize ~/ 2;
    final sorted = positions.toList()..sort();
    final windows = <String>[];
    var lastEnd = 0;

    for (final pos in sorted) {
      if (windows.length >= maxWindows) break;
      // Already covered by the previous window.
      if (pos < lastEnd) continue;
      // Start at the previous window's end when the two would overlap, so
      // that a match just past that window is still covered.
      final start = (pos - half).clamp(lastEnd, text.length);
      final end = (pos + half).clamp(start, text.length);
      windows.add(text.substring(start, end).trim());
      lastEnd = end;
    }

    Logger.info(
      'Keyword windows found: ${windows.length} for query "$userQuery"',
    );
    return windows;
  }

  /// Splits [text] into chunks of at most [chunkSize] characters, where each
  /// chunk starts [overlap] characters before the end of the previous one.
  ///
  /// A chunk boundary is moved back to the nearest space or newline so that
  /// words are not cut; when the chunk contains no such character it is cut
  /// at exactly [chunkSize]. Chunks are trimmed. A text that fits in one
  /// chunk is returned unchanged as the only element.
  ///
  /// [overlap] is limited to the range `0 .. chunkSize - 1`, and every chunk
  /// starts after the previous one started, so the loop always terminates.
  ///
  /// Throws an [ArgumentError] when [chunkSize] is not positive.
  static List<String> _chunkText(String text, int chunkSize, int overlap) {
    if (chunkSize <= 0) {
      throw ArgumentError.value(chunkSize, 'chunkSize', 'must be positive');
    }

    final textLength = text.length;
    if (textLength <= chunkSize) {
      return [text];
    }

    final safeOverlap = overlap.clamp(0, chunkSize - 1);
    final chunks = <String>[];
    var start = 0;

    while (start < textLength) {
      var end = start + chunkSize;

      if (end >= textLength) {
        end = textLength;
      } else {
        // Look backwards for a space/newline to break on. If there is none
        // in this chunk, keep the hard cut at `start + chunkSize`.
        var breakAt = end;
        while (breakAt > start &&
            text[breakAt] != ' ' &&
            text[breakAt] != '\n') {
          breakAt--;
        }
        if (breakAt > start) end = breakAt;
      }

      chunks.add(text.substring(start, end).trim());

      // The tail has been emitted. Stepping back by the overlap here would
      // re-emit it forever.
      if (end >= textLength) break;

      // Step back by the overlap, but always move forward overall.
      final next = end - safeOverlap;
      start = next > start ? next : end;
    }

    return chunks;
  }

  /// Returns up to [maxChunks] of [chunks], best keyword match first.
  ///
  /// Keywords are the words of [userQuery] longer than three characters,
  /// compared case-insensitively. A chunk scores 10 points per keyword
  /// occurrence, 5 more for each keyword it starts with, and one point per
  /// 100 characters of length. Chunks with equal scores keep their original
  /// relative order.
  ///
  /// When there are no chunks or no keywords, the first [maxChunks] chunks
  /// are returned in their original order.
  static List<String> _selectRelevantChunks(
    List<String> chunks,
    String userQuery,
    int maxChunks,
  ) {
    // Split on anything that is not a Unicode letter, digit or underscore.
    // ASCII-only `\w` would break accented words such as "função" apart.
    final queryWords = userQuery
        .toLowerCase()
        .split(RegExp(r'[^\p{L}\p{N}_]+', unicode: true))
        .where((w) => w.length > 3)
        .toSet();

    if (chunks.isEmpty || queryWords.isEmpty) {
      return chunks.take(maxChunks).toList();
    }

    int scoreOf(String chunk) {
      final lower = chunk.toLowerCase();
      // Length bonus: prefer fuller chunks.
      var score = chunk.length ~/ 100;
      for (final word in queryWords) {
        score += word.allMatches(lower).length * 10;
        if (lower.startsWith(word)) score += 5;
      }
      return score;
    }

    final scores = [for (final chunk in chunks) scoreOf(chunk)];

    // Sort indices rather than chunks so that ties can be broken by original
    // position; `List.sort` gives no stability guarantee.
    final order = List<int>.generate(chunks.length, (i) => i)
      ..sort((a, b) {
        final byScore = scores[b].compareTo(scores[a]);
        return byScore != 0 ? byScore : a.compareTo(b);
      });

    Logger.info(
      'Top chunk scores: ${order.take(3).map((i) => scores[i]).toList()}',
    );

    return order.take(maxChunks).map((i) => chunks[i]).toList();
  }

  /// Reformats text extracted from a PDF so that it reads better in a preview.
  ///
  /// The steps, in order: collapse runs of spaces/tabs, trim every line,
  /// reduce three or more consecutive line breaks to a blank line, start a
  /// new paragraph after a line ending in a period that is followed by a
  /// capitalised line, re-join words hyphenated across a line break, and
  /// surround all-caps heading lines with blank lines.
  ///
  /// The result never contains more than one consecutive blank line and has
  /// no leading or trailing whitespace. An empty [text] is returned as is.
  static String _formatPDFText(String text) {
    if (text.isEmpty) return text;

    final excessBreaks = RegExp(r'\n{3,}');

    // Collapse horizontal whitespace and trim each line first, so that lines
    // holding only spaces count as blank lines in the next step.
    var formatted = text
        .replaceAll(RegExp(r'[ \t]+'), ' ')
        .split('\n')
        .map((line) => line.trim())
        .join('\n')
        .replaceAll(excessBreaks, '\n\n');

    // Paragraph break after a sentence that ends its line.
    formatted = formatted.replaceAllMapped(
      RegExp(r'\.(\n)([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ])'),
      (match) => '.\n\n${match.group(2)}',
    );

    // Re-join words hyphenated at the end of a line.
    formatted = formatted.replaceAllMapped(
      RegExp(r'([a-zA-Záéíóúàâêôãõç])-\n([a-zA-Záéíóúàâêôãõç])'),
      (match) => '${match.group(1)}${match.group(2)}',
    );

    // Blank lines around headings (lines written in capitals).
    formatted = formatted.replaceAllMapped(
      RegExp(r'\n([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ][A-ZÁÉÍÓÚÀÂÊÔÃÕÇ\s]{10,})\n'),
      (match) => '\n\n${match.group(1)}\n\n',
    );

    // The paragraph and heading steps can stack line breaks next to existing
    // blank lines, so collapse once more before returning.
    return formatted.replaceAll(excessBreaks, '\n\n').trim();
  }

  /// Returns the formatted text of one PDF, for preview purposes.
  ///
  /// `.pdf` is appended to [fileName] when it does not already end with that
  /// extension (in any letter case). The text is extracted with
  /// [_extractFullText], formatted with [_formatPDFText] and cached per
  /// teacher and file.
  ///
  /// Returns an empty string when no text could be extracted or an error
  /// occurred. Empty results are not cached, so a later call tries again.
  static Future<String> getFullPDFText(
    String fileName,
    String teacherId,
  ) async {
    try {
      // Add the extension when missing. The check ignores case so that
      // "Notes.PDF" is not turned into "Notes.PDF.pdf".
      final cleanFileName = fileName.toLowerCase().endsWith('.pdf')
          ? fileName
          : '$fileName.pdf';

      // The key includes the teacher so that equally named files uploaded by
      // different teachers do not share an entry.
      final cacheKey = '$teacherId/${cleanFileName}_preview_v6';
      final cached = _chunksCache[cacheKey];
      if (cached != null && cached.isNotEmpty) {
        final fullText = cached.first;
        Logger.info(
          'Using cached preview text for $cleanFileName: ${fullText.length} chars',
        );
        return fullText;
      }

      final rawText = await _extractFullText(cleanFileName, teacherId);
      if (rawText.isEmpty) {
        // Extraction reports failures as ''. Caching that would keep serving
        // an empty preview until the cache is cleared.
        Logger.warning('No text extracted for preview of $cleanFileName');
        return '';
      }

      final formattedText = _formatPDFText(rawText);
      _chunksCache[cacheKey] = [formattedText];

      Logger.info(
        'PDF "$cleanFileName" -> ${formattedText.length} chars extracted and formatted for preview',
      );
      return formattedText;
    } catch (e) {
      Logger.error('Error getting full PDF text for $fileName: $e');
      return '';
    }
  }

  /// Empties the in-memory cache of extracted PDF text and logs that it did.
  static void clearCache() {
    // Drop every cached entry; the next request extracts the text again.
    _chunksCache.clear();

    Logger.info('Materials chunks cache cleared');
  }

  /// Removes lines that look like rows of tabular data from [text].
  ///
  /// A line is dropped when, after trimming, it either
  ///
  /// * contains at least two separate "number whitespace number" pairs, or
  /// * contains a `d/m/yyyy` date and at least two further numbers besides
  ///   the date(s).
  ///
  /// All other lines are kept unchanged and in order. A line that merely
  /// mentions a date is kept.
  static String _filterTableData(String text) {
    final numberPair = RegExp(r'\d+\s+\d+');
    final datePattern = RegExp(r'\d{1,2}/\d{1,2}/\d{4}');
    final number = RegExp(r'\d+');

    bool looksTabular(String line) {
      final trimmed = line.trim();

      // Several numbers separated only by whitespace.
      if (numberPair.allMatches(trimmed).length >= 2) return true;

      if (!datePattern.hasMatch(trimmed)) return false;

      // A date alone already holds three digit groups, so the other numbers
      // are counted with the dates taken out of the line.
      final withoutDates = trimmed.replaceAll(datePattern, ' ');
      return number.allMatches(withoutDates).length >= 2;
    }

    return text.split('\n').where((line) => !looksTabular(line)).join('\n');
  }
}