import 'package:cloud_firestore/cloud_firestore.dart';
import 'package:firebase_auth/firebase_auth.dart';
import 'package:firebase_storage/firebase_storage.dart';
import 'package:syncfusion_flutter_pdf/pdf.dart';

import '../utils/logger.dart';

/// Retrieves context text from teacher-uploaded PDFs for RAG prompts.
///
/// Small PDFs are sent whole; for large PDFs only the text windows around
/// query keywords are returned (simple keyword matching, no embeddings).
class MaterialsRAGService {
  static final FirebaseFirestore _firestore = FirebaseFirestore.instance;
  static final FirebaseStorage _storage = FirebaseStorage.instanceFor(
    bucket: 'teachit-app.firebasestorage.app',
  );
  static final FirebaseAuth _auth = FirebaseAuth.instance;

  /// In-memory cache of extracted PDF text: `{cacheKey: [fullText]}`.
  ///
  /// Keys are built by [_cacheKey] and include the teacher id, because the
  /// same file name can exist under different teachers in storage.
  static final Map<String, List<String>> _chunksCache = {};

  /// Default maximum number of context windows sent to the model per PDF.
  static const int _maxRelevantChunks = 5;

  /// Number of values sent per Firestore `whereIn` query.
  static const int _whereInBatchSize = 10;

  /// PDFs whose cleaned text is at most this long are sent in full.
  static const int _smallPdfMaxChars = 10000;

  /// Matches runs of characters that are not letters, digits or `_`.
  ///
  /// Unicode-aware on purpose: the ASCII-only `\w` would split accented
  /// words such as "função" in the middle.
  static final RegExp _nonWordRun = RegExp(r'[^\p{L}\p{N}_]+', unicode: true);

  /// Matches a token made of one uppercase letter followed by two or more
  /// lowercase letters (e.g. "Rae", "Álvaro").
  static final RegExp _properNoun = RegExp(
    r'^\p{Lu}\p{Ll}{2,}$',
    unicode: true,
  );

  /// Builds the versioned cache key for a teacher's file.
  ///
  /// The `_v6` suffix is kept from the original key format; bump it when the
  /// cleaning pipeline changes so previously cached text is not reused.
  static String _cacheKey(
    String teacherId,
    String fileName, {
    bool preview = false,
  }) =>
      '$teacherId/$fileName${preview ? '_preview' : ''}_v6';

  /// Lists the materials available to the authenticated student.
  ///
  /// Returns only materials whose `classId` matches a class the student is
  /// enrolled in, plus materials without a `classId` (kept for compatibility
  /// with older uploads). Each entry has `id` and `name`, and `classId`,
  /// `teacherId` and `url` when present. Returns an empty list when no user is
  /// signed in, when the student has no enrollments, or on any error.
  static Future<List<Map<String, dynamic>>>
      getAvailableMaterialsForStudent() async {
    try {
      final user = _auth.currentUser;
      if (user == null) return [];
      final uid = user.uid;

      // 1. Class ids from the student's enrollments.
      final enrollmentSnapshot = await _firestore
          .collection('enrollments')
          .where('studentId', isEqualTo: uid)
          .get();
      final enrolledClassIds = enrollmentSnapshot.docs
          .map((doc) => doc.data()['classId'] as String?)
          .whereType<String>()
          .toSet();
      if (enrolledClassIds.isEmpty) return [];

      // 2. Teacher ids of those classes (reusing the class ids fetched above
      //    instead of querying the enrollments a second time).
      final teacherIds = await _getTeacherIdsForStudent(
        uid,
        knownClassIds: enrolledClassIds,
      );
      if (teacherIds.isEmpty) return [];

      // 3. All materials of those teachers, queried in whereIn-sized batches
      //    so that teachers beyond the first batch are not silently dropped.
      final snapshots = await Future.wait([
        for (var i = 0; i < teacherIds.length; i += _whereInBatchSize)
          _firestore
              .collection('materials')
              .where(
                'teacherId',
                whereIn: teacherIds.skip(i).take(_whereInBatchSize).toList(),
              )
              .orderBy('createdAt', descending: true)
              .get(),
      ]);
      final docs = [for (final snapshot in snapshots) ...snapshot.docs];
      if (snapshots.length > 1) {
        // Each batch is already newest-first; restore that order globally.
        // NOTE(review): assumes `createdAt` is a Firestore Timestamp — other
        // types compare as equal and keep an unspecified relative order.
        docs.sort((a, b) {
          final createdA = a.data()['createdAt'];
          final createdB = b.data()['createdAt'];
          if (createdA is Timestamp && createdB is Timestamp) {
            return createdB.compareTo(createdA);
          }
          return 0;
        });
      }

      // 4. Keep materials of the student's classes, or without a classId.
      final result = <Map<String, dynamic>>[];
      for (final doc in docs) {
        final data = doc.data();
        final classId = data['classId'] as String?;
        if (classId != null && !enrolledClassIds.contains(classId)) continue;

        final fileName = data['fileName'] as String? ?? 'Material';
        final teacherId = data['teacherId'] as String?;
        final url = data['url'] as String?;
        result.add({
          'id': doc.id,
          'name': fileName,
          if (classId != null) 'classId': classId,
          if (teacherId != null) 'teacherId': teacherId,
          if (url != null) 'url': url,
        });
      }

      Logger.info('Available materials for student: ${result.length}');
      return result;
    } catch (e) {
      Logger.error('Error getting available materials for student: $e');
      return [];
    }
  }

  /// Builds the materials context for [userQuery] from the selected PDFs.
  ///
  /// Only the materials listed in [selectedMaterialIds] are processed; when
  /// it is null or empty no PDF is processed and an empty string is returned
  /// (the user must pick a material before asking about its content).
  ///
  /// For each selected PDF, texts of up to [_smallPdfMaxChars] characters are
  /// included in full; longer texts contribute at most [maxChunks] keyword
  /// windows. [maxMaterials] is accepted for backward compatibility and is
  /// currently not used.
  ///
  /// Returns an empty string when no user is signed in, when nothing could be
  /// extracted, or on any error.
  static Future<String> getRelevantChunks({
    required String userQuery,
    int maxMaterials = 5,
    int maxChunks = _maxRelevantChunks,
    List<String>? selectedMaterialIds,
  }) async {
    try {
      final user = _auth.currentUser;
      if (user == null) {
        Logger.warning('No authenticated user for materials context');
        return '';
      }

      if (selectedMaterialIds == null || selectedMaterialIds.isEmpty) {
        Logger.info(
          'No selectedMaterialIds — skipping automatic PDF processing',
        );
        return '';
      }

      Logger.info('Fetching selected materials: $selectedMaterialIds');
      // De-duplicate so the same document is not fetched and emitted twice.
      final uniqueIds = selectedMaterialIds.toSet().toList();
      final snapshots = await Future.wait([
        for (var i = 0; i < uniqueIds.length; i += _whereInBatchSize)
          _firestore
              .collection('materials')
              .where(
                FieldPath.documentId,
                whereIn: uniqueIds.skip(i).take(_whereInBatchSize).toList(),
              )
              .get(),
      ]);
      final allDocs = [for (final snapshot in snapshots) ...snapshot.docs];
      Logger.info('Selected materials found: ${allDocs.length}');

      final contextBuffer = StringBuffer()
        ..writeln('Contexto dos materiais do professor:');
      var hasContent = false;

      for (final doc in allDocs) {
        final data = doc.data();
        final fileName = data['fileName'] as String?;
        if (fileName == null || !fileName.toLowerCase().endsWith('.pdf')) {
          continue;
        }
        // The teacher id is part of both the storage path and the cache key.
        final teacherId = data['teacherId'] as String?;
        if (teacherId == null) continue;

        final fullText = await _loadCleanedText(fileName, teacherId);
        if (fullText == null) continue;

        // Small PDFs (forms, notes, ...): send the full text.
        // Large PDFs: keyword window search so the model is not overloaded.
        final String context;
        if (fullText.length <= _smallPdfMaxChars) {
          context = fullText;
          Logger.info(
            'Small PDF — sending full text (${fullText.length} chars)',
          );
        } else {
          final windows = _extractKeywordWindows(
            fullText,
            userQuery,
            maxChunks,
          );
          context = windows.join('\n\n---\n\n');
          Logger.info('Large PDF — keyword windows: ${windows.length}');
        }

        if (context.isNotEmpty) {
          contextBuffer
            ..writeln('\n[MATERIAL: $fileName]')
            ..writeln(context);
          hasContent = true;
        }
      }

      return hasContent ? contextBuffer.toString() : '';
    } catch (e) {
      Logger.error('Error in RAG chunk retrieval: $e');
      return '';
    }
  }

  /// Returns the cleaned text of a teacher's PDF, using the cache if possible.
  ///
  /// Returns null when nothing could be extracted; failures are logged and
  /// never cached, so a later call retries the extraction.
  static Future<String?> _loadCleanedText(
    String fileName,
    String teacherId,
  ) async {
    final cacheKey = _cacheKey(teacherId, fileName);
    final cached = _chunksCache[cacheKey];
    if (cached != null && cached.isNotEmpty) {
      final text = cached.first;
      Logger.info('Using cached text for $fileName: ${text.length} chars');
      return text;
    }

    try {
      final rawText = await _extractFullText(fileName, teacherId);
      if (rawText.isEmpty) return null;

      final cleaned = _cleanExtractedText(rawText);
      _chunksCache[cacheKey] = [cleaned];
      Logger.info('PDF "$fileName" -> ${cleaned.length} chars extracted');
      return cleaned;
    } catch (e) {
      Logger.error('Error extracting text from $fileName: $e');
      return null;
    }
  }

  /// Normalizes raw extracted PDF text before it is sent to the model.
  static String _cleanExtractedText(String rawText) {
    // Collapse excessive whitespace (decorative layouts produce many "\n").
    var cleaned = rawText
        .replaceAll(RegExp(r'[ \t]+'), ' ')
        .replaceAll(RegExp(r'\n{2,}'), '\n')
        .trim();

    // Try to repair corrupted LaTeX encoding (Type1/OTF fonts without a
    // Unicode mapping emit the accent as a separate character).
    cleaned = cleaned
        .replaceAll('¸c˜ao', 'ção')
        .replaceAll('˜ao', 'ão')
        .replaceAll('¸c˜oes', 'ções')
        .replaceAll('˜oes', 'ões')
        .replaceAll('¸c', 'ç')
        .replaceAll('´a', 'á')
        .replaceAll('´e', 'é')
        .replaceAll('´i', 'í')
        .replaceAll('´o', 'ó')
        .replaceAll('´u', 'ú')
        .replaceAll('ˆa', 'â')
        .replaceAll('ˆe', 'ê')
        .replaceAll('ˆo', 'ô')
        .replaceAll('`a', 'à');

    // Rebuild spaces in glued text (LaTeX without a ToUnicode map): insert a
    // space before an uppercase letter that follows a lowercase letter/digit.
    cleaned = cleaned.replaceAllMapped(
      RegExp(r'([a-záéíóúàâêôãõç\d])([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ])'),
      (m) => '${m.group(1)} ${m.group(2)}',
    );
    // Insert a space between a digit and the letter that follows it.
    cleaned = cleaned.replaceAllMapped(
      RegExp(r'(\d)([A-Za-záéíóúàâêôãõç])'),
      (m) => '${m.group(1)} ${m.group(2)}',
    );
    return cleaned;
  }

  /// Legacy entry point kept for compatibility.
  ///
  /// Delegates to [getRelevantChunks] with an empty query and no selected
  /// materials, so it currently always yields an empty string.
  @Deprecated('Use getRelevantChunks with userQuery instead')
  static Future<String> getMaterialsContext({int maxMaterials = 5}) async {
    return getRelevantChunks(
      userQuery: '',
      maxMaterials: maxMaterials,
      maxChunks: 3,
    );
  }

  /// Returns the unique teacher ids of the classes [studentId] is enrolled in.
  ///
  /// When [knownClassIds] is given, those class ids are used directly and the
  /// enrollments are not queried again. Returns an empty list when there are
  /// no enrollments or class ids, or on any error.
  static Future<List<String>> _getTeacherIdsForStudent(
    String studentId, {
    Iterable<String>? knownClassIds,
  }) async {
    try {
      final List<String> classIds;
      if (knownClassIds != null) {
        classIds = knownClassIds.toList();
      } else {
        // 1. Fetch the student's enrollments.
        final enrollmentSnapshot = await _firestore
            .collection('enrollments')
            .where('studentId', isEqualTo: studentId)
            .get();
        if (enrollmentSnapshot.docs.isEmpty) {
          Logger.info('No enrollments found for student $studentId');
          return [];
        }

        // 2. Extract the class ids from the enrollments.
        classIds = enrollmentSnapshot.docs
            .map((doc) => doc.data()['classId'] as String?)
            .whereType<String>()
            .toList();
      }

      if (classIds.isEmpty) {
        Logger.info('No class IDs found in enrollments');
        return [];
      }
      Logger.info('Found ${classIds.length} classes for student');

      // 3. Fetch the classes in whereIn-sized batches and collect teacherIds.
      final teacherIds = <String>{};
      for (var i = 0; i < classIds.length; i += _whereInBatchSize) {
        final batch = classIds.skip(i).take(_whereInBatchSize).toList();
        final classSnapshot = await _firestore
            .collection('classes')
            .where(FieldPath.documentId, whereIn: batch)
            .get();
        for (final doc in classSnapshot.docs) {
          final teacherId = doc.data()['teacherId'] as String?;
          if (teacherId != null && teacherId.isNotEmpty) {
            teacherIds.add(teacherId);
          }
        }
      }

      Logger.info('Found ${teacherIds.length} unique teachers');
      return teacherIds.toList();
    } catch (e) {
      Logger.error('Error getting teacher IDs for student: $e');
      return [];
    }
  }

  /// Maximum number of bytes downloaded per PDF from Firebase Storage (10 MB).
  static const int _maxPdfBytes = 10 * 1024 * 1024;

  /// Maximum number of characters kept from a PDF's extracted text.
  static const int _maxExtractedChars = 50000;

  /// Extracts the text of `teachers/{teacherId}/materials/{fileName}`.
  ///
  /// Downloads the file with `getData` (bounded by [_maxPdfBytes]), extracts
  /// the page text with syncfusion's [PdfTextExtractor], appends the values
  /// of any form fields and truncates the result to [_maxExtractedChars].
  /// Returns an empty string when there is no data or on any error.
  static Future<String> _extractFullText(
    String fileName,
    String teacherId,
  ) async {
    PdfDocument? document;
    try {
      final ref = _storage
          .ref()
          .child('teachers')
          .child(teacherId)
          .child('materials')
          .child(fileName);

      Logger.info('PDF available for extraction: $fileName');

      final data = await ref.getData(_maxPdfBytes);
      if (data == null || data.isEmpty) {
        Logger.warning('No data received for $fileName');
        return '';
      }
      Logger.info('Downloaded ${data.length} bytes for $fileName');

      document = PdfDocument(inputBytes: data);
      final buffer = StringBuffer();

      // 1. Page text. Documents with more than four pages skip their first
      //    two pages (presumably cover/front matter); near-empty pages and
      //    pages detected as structure pages are skipped as well.
      final extractor = PdfTextExtractor(document);
      final totalPages = document.pages.count;
      final startPage = totalPages > 4 ? 2 : 0;
      for (var i = startPage; i < totalPages; i++) {
        if (buffer.length >= _maxExtractedChars) break;
        try {
          final pageText = extractor
              .extractText(startPageIndex: i, endPageIndex: i)
              .trim();
          if (pageText.length < 80 || _isStructurePage(pageText)) continue;
          buffer.writeln(pageText);
        } catch (e) {
          // Best effort: one unreadable page must not discard the others.
          Logger.warning('Skipping page $i of $fileName: $e');
        }
      }

      // 2. Values of form fields, when the document has any.
      final form = document.form;
      if (form.fields.count > 0) {
        buffer.writeln('\n[CAMPOS DO FORMULÁRIO]');
        for (var i = 0; i < form.fields.count; i++) {
          if (buffer.length >= _maxExtractedChars) break;
          final field = form.fields[i];
          final name = field.name ?? '';
          var value = '';
          if (field is PdfTextBoxField) {
            value = field.text;
          } else if (field is PdfComboBoxField) {
            value = field.selectedValue;
          } else if (field is PdfListBoxField) {
            value = field.selectedValues.join(', ');
          } else if (field is PdfRadioButtonListField) {
            value = field.selectedValue;
          } else if (field is PdfCheckBoxField) {
            value = field.isChecked ? 'Sim' : 'Não';
          }
          if (name.isNotEmpty || value.isNotEmpty) {
            buffer.writeln('$name: $value');
          }
        }
      }

      // Truncate to the extraction limit.
      final fullText = buffer.toString();
      final result = fullText.length > _maxExtractedChars
          ? fullText.substring(0, _maxExtractedChars)
          : fullText;

      Logger.info(
        'Extracted ${result.length} chars from $fileName (${document.pages.count} pages, ${form.fields.count} form fields)',
      );
      Logger.info(
        'Text preview: ${result.length > 200 ? result.substring(0, 200) : result}',
      );

      return result.trim();
    } catch (e) {
      Logger.error('Error extracting text from $fileName: $e');
      return '';
    } finally {
      document?.dispose();
    }
  }

  /// Whether [pageText] looks like a structure page rather than content.
  ///
  /// A page qualifies when it contains more than three `|` characters, or one
  /// of a few marker phrases (some only when the page is short).
  static bool _isStructurePage(String pageText) {
    final lowerText = pageText.toLowerCase();
    final length = pageText.length;
    if ('|'.allMatches(pageText).length > 3) return true;
    return (lowerText.contains('table of contents') && length < 800) ||
        (lowerText.contains('copyright') && length < 400) ||
        (lowerText.contains('color insert') && length < 400) ||
        lowerText.contains('just light novels') ||
        lowerText.contains('download all your fav') ||
        (lowerText.contains('www.') && length < 300);
  }

  /// Extracts the lowercase search keywords of [query].
  ///
  /// Keeps every token longer than three characters and, when
  /// [includeProperNouns] is true, also capitalized tokens of three or more
  /// letters (proper nouns tend to be invariant across languages, e.g.
  /// "Rae"). Tokenizing is Unicode-aware so accented words stay whole.
  static Set<String> _extractKeywords(
    String query, {
    bool includeProperNouns = true,
  }) =>
      {
        for (final token in query.split(_nonWordRun))
          if (token.length > 3 ||
              (includeProperNouns && _properNoun.hasMatch(token)))
            token.toLowerCase(),
      };

  /// Returns up to [maxWindows] text windows centered on keyword matches.
  ///
  /// Keywords come from [userQuery] (see [_extractKeywords]). Each window
  /// spans about [windowSize] characters around a match; a window that would
  /// overlap the previously emitted one is skipped. Works on substrings of
  /// [text] without pre-chunking it.
  ///
  /// Falls back to the beginning of [text] when there is no query or no
  /// keyword, and to a single larger slice near the start when no keyword
  /// occurs in [text].
  static List<String> _extractKeywordWindows(
    String text,
    String userQuery,
    int maxWindows, {
    int windowSize = 1200,
  }) {
    String head() =>
        text.length > windowSize ? text.substring(0, windowSize) : text;

    // No text or no query: return the beginning of the text.
    if (text.isEmpty || userQuery.isEmpty) return [head()];

    final keywords = _extractKeywords(userQuery);
    if (keywords.isEmpty) return [head()];

    // Collect the unique positions where any keyword occurs.
    final textLower = text.toLowerCase();
    final positions = <int>{};
    for (final keyword in keywords) {
      var idx = textLower.indexOf(keyword);
      while (idx != -1) {
        positions.add(idx);
        idx = textLower.indexOf(keyword, idx + 1);
      }
    }

    if (positions.isEmpty) {
      // No match: return a slice that starts after the first 5% of the text
      // (at most 2000 chars in), skipping likely cover/index content.
      final skip = (text.length * 0.05).toInt().clamp(0, 2000);
      // Clamp to `skip` so a non-positive maxWindows yields an empty slice
      // instead of an invalid range.
      final end = (skip + windowSize * maxWindows).clamp(skip, text.length);
      return [text.substring(skip, end).trim()];
    }

    // Walk the positions in order, skipping windows that overlap the last.
    final sorted = positions.toList()..sort();
    final windows = <String>[];
    var lastEnd = -1;
    for (final pos in sorted) {
      if (windows.length >= maxWindows) break;
      final start = (pos - windowSize ~/ 2).clamp(0, text.length);
      final end = (pos + windowSize ~/ 2).clamp(0, text.length);
      if (start < lastEnd) continue;
      windows.add(text.substring(start, end).trim());
      lastEnd = end;
    }

    Logger.info(
      'Keyword windows found: ${windows.length} for query "$userQuery"',
    );
    return windows;
  }

  /// Splits [text] into chunks of about [chunkSize] characters.
  ///
  /// Consecutive chunks share up to [overlap] characters. Chunks end on a
  /// space or newline when one exists inside the chunk. A non-positive
  /// [chunkSize], or a text that already fits, yields `[text]`.
  static List<String> _chunkText(String text, int chunkSize, int overlap) {
    final textLength = text.length;
    if (chunkSize <= 0 || textLength <= chunkSize) return [text];

    final safeOverlap = overlap < 0 ? 0 : overlap;
    final chunks = <String>[];
    var start = 0;
    while (start < textLength) {
      var end = start + chunkSize;
      if (end >= textLength) {
        end = textLength;
      } else {
        // Try to break on whitespace so words are not cut in half; keep the
        // hard cut when the chunk contains no whitespace at all.
        var breakAt = end;
        while (breakAt > start &&
            text[breakAt] != ' ' &&
            text[breakAt] != '\n') {
          breakAt--;
        }
        if (breakAt > start) end = breakAt;
      }

      chunks.add(text.substring(start, end).trim());

      // The tail has been emitted: stop, otherwise stepping back by the
      // overlap would re-emit the last chunk forever.
      if (end >= textLength) break;

      // Step back by the overlap, but always make forward progress.
      final next = end - safeOverlap;
      start = next > start ? next : end;
    }
    return chunks;
  }

  /// Returns the [maxChunks] chunks that best match [userQuery].
  ///
  /// Scoring is simple keyword matching: 10 points per keyword occurrence, 5
  /// more when the chunk starts with the keyword, plus 1 point per 100
  /// characters of chunk length. Without a usable query the first chunks are
  /// returned.
  static List<String> _selectRelevantChunks(
    List<String> chunks,
    String userQuery,
    int maxChunks,
  ) {
    if (userQuery.isEmpty || chunks.isEmpty) {
      return chunks.take(maxChunks).toList();
    }

    // Keywords are the query words longer than three characters.
    final queryWords = _extractKeywords(userQuery, includeProperNouns: false);
    if (queryWords.isEmpty) {
      return chunks.take(maxChunks).toList();
    }

    final scoredChunks = <MapEntry<String, int>>[];
    for (final chunk in chunks) {
      final chunkLower = chunk.toLowerCase();
      // Length bonus: prefer more complete chunks.
      var score = (chunk.length / 100).floor();
      for (final word in queryWords) {
        score += word.allMatches(chunkLower).length * 10;
        if (chunkLower.startsWith(word)) score += 5;
      }
      scoredChunks.add(MapEntry(chunk, score));
    }

    // Highest score first.
    scoredChunks.sort((a, b) => b.value.compareTo(a.value));

    Logger.info(
      'Top chunk scores: ${scoredChunks.take(3).map((e) => e.value).toList()}',
    );

    return scoredChunks.take(maxChunks).map((e) => e.key).toList();
  }

  /// Formats extracted PDF text for better readability in a preview.
  static String _formatPDFText(String text) {
    if (text.isEmpty) return text;

    var formatted = text;

    // Collapse runs of three or more line breaks into a blank line.
    formatted = formatted.replaceAll(RegExp(r'\n{3,}'), '\n\n');

    // Collapse runs of spaces and tabs.
    formatted = formatted.replaceAll(RegExp(r'[ \t]+'), ' ');

    // Trim every line.
    formatted = formatted.split('\n').map((line) => line.trim()).join('\n');

    // Start a new paragraph after a line ending in a period that is followed
    // by a line starting with an uppercase letter.
    formatted = formatted.replaceAllMapped(
      RegExp(r'\.(\n)([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ])'),
      (match) => '.\n\n${match.group(2)}',
    );

    // Re-join words hyphenated across a line break.
    formatted = formatted.replaceAllMapped(
      RegExp(r'([a-zA-Záéíóúàâêôãõç])-\n([a-zA-Záéíóúàâêôãõç])'),
      (match) => '${match.group(1)}${match.group(2)}',
    );

    // Surround headings (lines in uppercase) with blank lines.
    formatted = formatted.replaceAllMapped(
      RegExp(r'\n([A-ZÁÉÍÓÚÀÂÊÔÃÕÇ][A-ZÁÉÍÓÚÀÂÊÔÃÕÇ\s]{10,})\n'),
      (match) => '\n\n${match.group(1)}\n\n',
    );

    return formatted.trim();
  }

  /// Returns the full, formatted text of one PDF for previewing.
  ///
  /// Appends `.pdf` to [fileName] when it has no such extension (checked
  /// case-insensitively). Results are cached per teacher and file; empty
  /// results are not cached, so a failed download is retried on the next
  /// call. Returns an empty string on any error.
  static Future<String> getFullPDFText(
    String fileName,
    String teacherId,
  ) async {
    try {
      final cleanFileName =
          fileName.toLowerCase().endsWith('.pdf') ? fileName : '$fileName.pdf';

      final cacheKey = _cacheKey(teacherId, cleanFileName, preview: true);
      final cached = _chunksCache[cacheKey];
      if (cached != null && cached.isNotEmpty) {
        final fullText = cached.first;
        Logger.info(
          'Using cached preview text for $cleanFileName: ${fullText.length} chars',
        );
        return fullText;
      }

      final rawText = await _extractFullText(cleanFileName, teacherId);
      final formattedText = _formatPDFText(rawText);

      if (formattedText.isNotEmpty) {
        _chunksCache[cacheKey] = [formattedText];
      }
      Logger.info(
        'PDF "$cleanFileName" -> ${formattedText.length} chars extracted and formatted for preview',
      );

      return formattedText;
    } catch (e) {
      Logger.error('Error getting full PDF text for $fileName: $e');
      return '';
    }
  }

  /// Clears the in-memory cache of extracted PDF text.
  static void clearCache() {
    _chunksCache.clear();
    Logger.info('Materials chunks cache cleared');
  }
}