359 lines
11 KiB
Dart
359 lines
11 KiB
Dart
import 'dart:math';
|
|
import 'package:cloud_firestore/cloud_firestore.dart';
|
|
import '../models/content_chunk.dart';
|
|
import '../utils/logger.dart';
|
|
|
|
/// Service for vector embeddings and similarity search.
///
/// Embeddings are produced by a deterministic mock (see [generateEmbedding]);
/// content chunks are stored in, and read from, the `contentChunks` Firestore
/// collection. All members are static.
class VectorService {
  static final FirebaseFirestore _firestore = FirebaseFirestore.instance;

  /// Name of the Firestore collection that holds content chunks.
  static const String _chunksCollection = 'contentChunks';

  /// Number of components in every embedding produced by this service.
  static const int _embeddingDimension = 384;

  /// Maximum number of candidate documents fetched per similarity search.
  static const int _candidateLimit = 100;

  /// Maximum number of query characters echoed into the log.
  static const int _logPreviewLength = 50;

  /// Generates a mock, unit-length embedding for [text].
  ///
  /// Each component is derived from `text.hashCode` and the component index,
  /// plus a small boost when [text] contains one of a few hard-coded
  /// Portuguese keywords. A production implementation would call a real
  /// embedding model (OpenAI, Cohere, or a local model) instead.
  ///
  /// Returns a list of 384 doubles. If anything throws, or the vector has
  /// zero length and so cannot be normalised, an all-zero vector is returned.
  ///
  /// NOTE(review): the values depend on `String.hashCode`; confirm that it is
  /// stable across every platform and SDK version that reads or writes stored
  /// embeddings before relying on persisted values.
  static List<double> generateEmbedding(String text) {
    try {
      Logger.info('Generating embedding for text of length: ${text.length}');

      final hash = text.hashCode;

      // The keyword checks do not depend on the component index, so they are
      // evaluated once here instead of once per component.
      final textLower = text.toLowerCase();
      final hasPlantTerms =
          textLower.contains('fotossíntese') || textLower.contains('plantas');
      final hasEnergyTerms =
          textLower.contains('energia') || textLower.contains('luz');
      final hasBiologyTerms =
          textLower.contains('biologia') || textLower.contains('processo');

      final embedding = List<double>.generate(_embeddingDimension, (i) {
        // Deterministic base value in [-1, 1) from the hash and the position.
        final seed = (hash * (i + 1)) % 1000;
        final value = (seed / 1000.0 - 0.5) * 2.0;

        var semanticBoost = 0.0;
        if (hasPlantTerms) {
          semanticBoost += 0.3 * (i % 10) / 10.0;
        }
        if (hasEnergyTerms) {
          semanticBoost += 0.2 * (i % 8) / 8.0;
        }
        if (hasBiologyTerms) {
          semanticBoost += 0.1 * (i % 12) / 12.0;
        }

        return value + semanticBoost;
      });

      // Normalise to unit length.
      final norm = sqrt(embedding.fold<double>(0.0, (sum, x) => sum + x * x));
      if (norm == 0) {
        // Dividing by zero would fill the vector with NaN.
        return List<double>.filled(_embeddingDimension, 0.0);
      }
      return [for (final x in embedding) x / norm];
    } catch (e) {
      Logger.error('Error generating embedding: $e');
      // Zero vector as fallback.
      return List<double>.filled(_embeddingDimension, 0.0);
    }
  }

  /// Returns the cosine similarity between [vec1] and [vec2].
  ///
  /// Returns `0.0` when either vector has zero magnitude (including when both
  /// are empty).
  ///
  /// Throws an [ArgumentError] if the vectors differ in length.
  static double cosineSimilarity(List<double> vec1, List<double> vec2) {
    if (vec1.length != vec2.length) {
      throw ArgumentError('Vectors must be of same length');
    }

    double dotProduct = 0.0;
    double norm1 = 0.0;
    double norm2 = 0.0;

    for (int i = 0; i < vec1.length; i++) {
      dotProduct += vec1[i] * vec2[i];
      norm1 += vec1[i] * vec1[i];
      norm2 += vec2[i] * vec2[i];
    }

    if (norm1 == 0 || norm2 == 0) return 0.0;

    return dotProduct / (sqrt(norm1) * sqrt(norm2));
  }

  /// Searches active content chunks for those most similar to
  /// [queryEmbedding].
  ///
  /// At most 100 candidate documents are fetched, optionally filtered by
  /// [subject], [concept], [grade] and a `difficulty` range
  /// ([minDifficulty]..[maxDifficulty], both inclusive). Candidates are scored
  /// with [cosineSimilarity]; those scoring at least [threshold] are returned
  /// in descending order of similarity, limited to [k] results.
  ///
  /// A chunk whose stored embedding has a different length than
  /// [queryEmbedding] is skipped (and logged) rather than failing the search.
  ///
  /// Returns an empty list when [k] is not positive or when any error occurs.
  static Future<List<ContentChunk>> searchSimilar({
    required List<double> queryEmbedding,
    String? subject,
    String? concept,
    int? grade,
    double? minDifficulty,
    double? maxDifficulty,
    int k = 5,
    double threshold = 0.3,
  }) async {
    try {
      Logger.info(
        'Searching for similar content with k=$k, threshold=$threshold',
      );

      // Nothing can be returned, so skip the Firestore round trip.
      if (k <= 0) return [];

      Query<Map<String, dynamic>> query = _firestore
          .collection(_chunksCollection)
          .where('isActive', isEqualTo: true);

      // Apply the optional filters.
      if (subject != null) {
        query = query.where('subject', isEqualTo: subject);
      }
      if (concept != null) {
        query = query.where('concept', isEqualTo: concept);
      }
      if (grade != null) {
        query = query.where('grade', isEqualTo: grade);
      }
      if (minDifficulty != null) {
        query = query.where(
          'difficulty',
          isGreaterThanOrEqualTo: minDifficulty,
        );
      }
      if (maxDifficulty != null) {
        query = query.where('difficulty', isLessThanOrEqualTo: maxDifficulty);
      }

      // Fetch more candidates than k so the similarity filter has material
      // to work with.
      final querySnapshot = await query.limit(_candidateLimit).get();

      // Score every candidate. A list (not a map keyed by chunk) is used so
      // that two chunks comparing equal cannot overwrite each other.
      final scoredChunks = <MapEntry<ContentChunk, double>>[];

      for (final doc in querySnapshot.docs) {
        final chunk = ContentChunk.fromFirestore(doc.data(), doc.id);

        if (chunk.embedding.length != queryEmbedding.length) {
          // cosineSimilarity would throw here and discard every result.
          Logger.error(
            'Skipping chunk ${doc.id}: embedding has '
            '${chunk.embedding.length} dimensions, '
            'expected ${queryEmbedding.length}',
          );
          continue;
        }

        final similarity = cosineSimilarity(queryEmbedding, chunk.embedding);
        if (similarity >= threshold) {
          scoredChunks.add(MapEntry(chunk, similarity));
        }
      }

      // Highest similarity first, then keep the top k.
      scoredChunks.sort((a, b) => b.value.compareTo(a.value));

      return scoredChunks.take(k).map((entry) => entry.key).toList();
    } catch (e) {
      Logger.error('Error searching similar content: $e');
      return [];
    }
  }

  /// Searches content chunks by free text.
  ///
  /// Generates an embedding for [query] with [generateEmbedding] and forwards
  /// it, together with the filters, [k] and [threshold], to [searchSimilar].
  ///
  /// Returns an empty list when any error occurs.
  static Future<List<ContentChunk>> searchByText({
    required String query,
    String? subject,
    String? concept,
    int? grade,
    double? minDifficulty,
    double? maxDifficulty,
    int k = 5,
    double threshold = 0.3,
  }) async {
    try {
      // Only truncate when the query is actually longer than the preview;
      // an unconditional substring(0, 50) throws a RangeError on short input.
      final preview = query.length > _logPreviewLength
          ? '${query.substring(0, _logPreviewLength)}...'
          : query;
      Logger.info('Searching by text: "$preview"');

      final queryEmbedding = generateEmbedding(query);

      return await searchSimilar(
        queryEmbedding: queryEmbedding,
        subject: subject,
        concept: concept,
        grade: grade,
        minDifficulty: minDifficulty,
        maxDifficulty: maxDifficulty,
        k: k,
        threshold: threshold,
      );
    } catch (e) {
      Logger.error('Error searching by text: $e');
      return [];
    }
  }

  /// Generates one embedding per entry of [texts], in the same order.
  ///
  /// On error, returns one independent all-zero vector per text.
  static Future<List<List<double>>> batchGenerateEmbeddings(
    List<String> texts,
  ) async {
    try {
      Logger.info('Generating embeddings for ${texts.length} texts');

      return [for (final text in texts) generateEmbedding(text)];
    } catch (e) {
      Logger.error('Error generating batch embeddings: $e');
      // List.generate gives every slot its own list; List.filled would put
      // the same list instance in all of them.
      return List.generate(
        texts.length,
        (_) => List<double>.filled(_embeddingDimension, 0.0),
      );
    }
  }

  /// Regenerates the embedding of chunk [chunkId] from [text] and stores it.
  ///
  /// Writes the `embedding` and `lastUpdated` fields of the chunk document.
  ///
  /// Throws an [Exception] if the update fails.
  static Future<void> updateChunkEmbedding(String chunkId, String text) async {
    try {
      Logger.info('Updating embedding for chunk: $chunkId');

      final embedding = generateEmbedding(text);

      await _firestore.collection(_chunksCollection).doc(chunkId).update({
        'embedding': embedding,
        'lastUpdated': FieldValue.serverTimestamp(),
      });

      Logger.info('Embedding updated for chunk: $chunkId');
    } catch (e) {
      Logger.error('Error updating chunk embedding: $e');
      throw Exception('Failed to update chunk embedding: $e');
    }
  }

  /// Returns the active chunks of content [contentId], ordered by
  /// `createdAt`.
  ///
  /// Returns an empty list when any error occurs.
  static Future<List<ContentChunk>> getContentChunks(String contentId) async {
    try {
      Logger.info('Getting chunks for content: $contentId');

      final querySnapshot = await _firestore
          .collection(_chunksCollection)
          .where('contentId', isEqualTo: contentId)
          .where('isActive', isEqualTo: true)
          .orderBy('createdAt')
          .get();

      return querySnapshot.docs
          .map((doc) => ContentChunk.fromFirestore(doc.data(), doc.id))
          .toList();
    } catch (e) {
      Logger.error('Error getting content chunks: $e');
      return [];
    }
  }

  /// Creates a content chunk document, including an embedding generated
  /// from [text], and returns the new document id.
  ///
  /// [subConcept], [pageNumber] and [section] are written only when non-null;
  /// [metadata] defaults to an empty map. The chunk is created with
  /// `isActive` set to `true` and a server-side `createdAt` timestamp.
  ///
  /// Throws an [Exception] if the document cannot be created.
  static Future<String> createContentChunk({
    required String contentId,
    required String text,
    required String subject,
    required String concept,
    String? subConcept,
    required String unit,
    required double difficulty,
    required int grade,
    required String sourceDocument,
    Map<String, dynamic>? metadata,
    int? pageNumber,
    String? section,
  }) async {
    try {
      Logger.info('Creating content chunk for: $concept');

      final embedding = generateEmbedding(text);

      final chunkData = <String, dynamic>{
        'contentId': contentId,
        'text': text,
        'subject': subject,
        'concept': concept,
        if (subConcept != null) 'subConcept': subConcept,
        'unit': unit,
        'difficulty': difficulty,
        'grade': grade,
        'embedding': embedding,
        'sourceDocument': sourceDocument,
        'metadata': metadata ?? <String, dynamic>{},
        'createdAt': FieldValue.serverTimestamp(),
        'isActive': true,
        if (pageNumber != null) 'pageNumber': pageNumber,
        if (section != null) 'section': section,
      };

      final docRef =
          await _firestore.collection(_chunksCollection).add(chunkData);
      final chunkId = docRef.id;

      Logger.info('Content chunk created: $chunkId');
      return chunkId;
    } catch (e) {
      Logger.error('Error creating content chunk: $e');
      throw Exception('Failed to create content chunk: $e');
    }
  }

  /// Deletes every chunk (active or not) that belongs to content
  /// [contentId].
  ///
  /// All deletions are committed in a single write batch.
  ///
  /// NOTE(review): the batch size is unbounded; confirm that the number of
  /// chunks per content stays within Firestore's per-commit limits.
  ///
  /// Throws an [Exception] if the query or the commit fails.
  static Future<void> deleteContentChunks(String contentId) async {
    try {
      Logger.info('Deleting chunks for content: $contentId');

      final querySnapshot = await _firestore
          .collection(_chunksCollection)
          .where('contentId', isEqualTo: contentId)
          .get();

      final batch = _firestore.batch();
      for (final doc in querySnapshot.docs) {
        batch.delete(doc.reference);
      }

      await batch.commit();
      Logger.info('Content chunks deleted: ${querySnapshot.docs.length}');
    } catch (e) {
      Logger.error('Error deleting content chunks: $e');
      throw Exception('Failed to delete content chunks: $e');
    }
  }

  /// Returns summary statistics over all active content chunks.
  ///
  /// The result has the keys `totalChunks` (int), `subjects` and `concepts`
  /// (name to count), `grades` (grade to count) and `embeddingDimension`.
  /// Documents lacking a subject or concept are counted under `'Unknown'`;
  /// documents lacking a grade are counted under `0`.
  ///
  /// On error, returns the same shape with zero counts and empty maps.
  static Future<Map<String, dynamic>> getVectorStats() async {
    try {
      Logger.info('Getting vector statistics');

      final querySnapshot = await _firestore
          .collection(_chunksCollection)
          .where('isActive', isEqualTo: true)
          .get();

      final totalChunks = querySnapshot.docs.length;
      final subjects = <String, int>{};
      final concepts = <String, int>{};
      final grades = <int, int>{};

      for (final doc in querySnapshot.docs) {
        final data = doc.data();
        final subject = data['subject'] as String? ?? 'Unknown';
        final concept = data['concept'] as String? ?? 'Unknown';
        // Read as num so a grade stored as a double (e.g. 5.0) does not
        // throw and void the whole report.
        final grade = (data['grade'] as num?)?.toInt() ?? 0;

        subjects[subject] = (subjects[subject] ?? 0) + 1;
        concepts[concept] = (concepts[concept] ?? 0) + 1;
        grades[grade] = (grades[grade] ?? 0) + 1;
      }

      return {
        'totalChunks': totalChunks,
        'subjects': subjects,
        'concepts': concepts,
        'grades': grades,
        'embeddingDimension': _embeddingDimension,
      };
    } catch (e) {
      Logger.error('Error getting vector stats: $e');
      return {
        'totalChunks': 0,
        'subjects': <String, int>{},
        'concepts': <String, int>{},
        'grades': <int, int>{},
        'embeddingDimension': _embeddingDimension,
      };
    }
  }
}
|