import { describe, it, expect, beforeEach, afterEach, afterAll } from 'vitest';
import BetterSqlite3 from 'better-sqlite3';
import { Kysely, SqliteDialect } from 'kysely';
import { DatabaseDedup } from './dedup.js';
import { migrate, reset } from '../../infrastructure/db/schema.js';
import type { Database } from '../../infrastructure/db/database.js';
import type { FeedItem } from '../../interfaces/feed.types.js';

/**
 * Tests for DatabaseDedup, run against a fresh in-memory SQLite database
 * per test.
 *
 * Only two methods are exercised here:
 *   - `filter(items)`   resolves to the subset of `items` not yet marked seen
 *   - `markSeen(items)` records the items so later `filter` calls drop them
 */
describe('DatabaseDedup', () => {
  let sqliteDb: BetterSqlite3.Database;
  let db: Kysely<Database>;
  let dedup: DatabaseDedup;

  beforeEach(async () => {
    // Create in-memory database for each test
    sqliteDb = new BetterSqlite3(':memory:');
    // NOTE(review): SQLite documents that an in-memory database ignores
    // journal modes other than MEMORY/OFF, so this pragma is presumably a
    // no-op here; it is kept to mirror the original setup.
    sqliteDb.pragma('journal_mode = WAL');

    db = new Kysely<Database>({
      dialect: new SqliteDialect({
        database: sqliteDb,
      }),
    });

    // Reset and migrate
    await reset(db);
    await migrate(db);

    dedup = new DatabaseDedup(db);
  });

  // Teardown must mirror setup: beforeEach opens a new database for every
  // test, so each one is destroyed after its test. Destroying only once at the
  // end of the suite would leave every earlier handle open.
  afterEach(async () => {
    await db.destroy();
  });

  describe('filter', () => {
    it('returns all items when nothing is marked seen', async () => {
      const items: FeedItem[] = [
        {
          id: 'item1',
          source: 'https://example.com/feed.xml',
          title: 'Article 1',
          url: 'https://example.com/1',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
        {
          id: 'item2',
          source: 'https://example.com/feed.xml',
          title: 'Article 2',
          url: 'https://example.com/2',
          publishedAt: new Date('2024-09-06T10:00:00Z'),
        },
      ];

      const filtered = await dedup.filter(items);

      expect(filtered).toHaveLength(2);
      expect(filtered[0].id).toBe('item1');
      expect(filtered[1].id).toBe('item2');
    });

    it('excludes items that have been marked seen', async () => {
      const items: FeedItem[] = [
        {
          id: 'item1',
          source: 'https://example.com/feed.xml',
          title: 'Article 1',
          url: 'https://example.com/1',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
        {
          id: 'item2',
          source: 'https://example.com/feed.xml',
          title: 'Article 2',
          url: 'https://example.com/2',
          publishedAt: new Date('2024-09-06T10:00:00Z'),
        },
      ];

      // Mark first item as seen
      await dedup.markSeen([items[0]]);

      const filtered = await dedup.filter(items);

      expect(filtered).toHaveLength(1);
      expect(filtered[0].id).toBe('item2');
    });

    it('returns empty array when all items are seen', async () => {
      const items: FeedItem[] = [
        {
          id: 'item1',
          source: 'https://example.com/feed.xml',
          title: 'Article 1',
          url: 'https://example.com/1',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
      ];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);

      expect(filtered).toHaveLength(0);
    });

    it('returns empty array for empty input', async () => {
      const filtered = await dedup.filter([]);

      expect(filtered).toHaveLength(0);
    });

    it('handles partial matches correctly', async () => {
      const items: FeedItem[] = [
        {
          id: 'seen-item',
          source: 'https://example.com/feed.xml',
          title: 'Seen Article',
          url: 'https://example.com/seen',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
        {
          id: 'new-item',
          source: 'https://example.com/feed.xml',
          title: 'New Article',
          url: 'https://example.com/new',
          publishedAt: new Date('2024-09-06T10:00:00Z'),
        },
        {
          id: 'another-seen',
          source: 'https://example.com/feed.xml',
          title: 'Another Seen',
          url: 'https://example.com/another',
          publishedAt: new Date('2024-09-06T11:00:00Z'),
        },
      ];

      // Mark the first and last items, leaving only the middle one unseen
      await dedup.markSeen([items[0], items[2]]);

      const filtered = await dedup.filter(items);

      expect(filtered).toHaveLength(1);
      expect(filtered[0].id).toBe('new-item');
    });
  });

  describe('markSeen', () => {
    it('marks items as seen', async () => {
      const items: FeedItem[] = [
        {
          id: 'item1',
          source: 'https://example.com/feed.xml',
          title: 'Article 1',
          url: 'https://example.com/1',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
      ];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('marks multiple items at once', async () => {
      const items: FeedItem[] = [
        {
          id: 'item1',
          source: 'https://example.com/feed.xml',
          title: 'Article 1',
          url: 'https://example.com/1',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
        {
          id: 'item2',
          source: 'https://example.com/feed.xml',
          title: 'Article 2',
          url: 'https://example.com/2',
          publishedAt: new Date('2024-09-06T10:00:00Z'),
        },
      ];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('handles empty array gracefully', async () => {
      // Should not throw: a rejected promise here fails the test
      await dedup.markSeen([]);
    });

    it('is idempotent - marking same item twice does not error', async () => {
      const item: FeedItem = {
        id: 'duplicate-id',
        source: 'https://example.com/feed.xml',
        title: 'Article',
        url: 'https://example.com/article',
        publishedAt: new Date('2024-09-06T09:00:00Z'),
      };

      await dedup.markSeen([item]);
      await dedup.markSeen([item]); // Should not throw

      const filtered = await dedup.filter([item]);
      expect(filtered).toHaveLength(0);
    });

    it('marks items incrementally', async () => {
      const item1: FeedItem = {
        id: 'item1',
        source: 'https://example.com/feed.xml',
        title: 'Article 1',
        url: 'https://example.com/1',
        publishedAt: new Date('2024-09-06T09:00:00Z'),
      };
      const item2: FeedItem = {
        id: 'item2',
        source: 'https://example.com/feed.xml',
        title: 'Article 2',
        url: 'https://example.com/2',
        publishedAt: new Date('2024-09-06T10:00:00Z'),
      };

      // Mark first item
      await dedup.markSeen([item1]);
      let filtered = await dedup.filter([item1, item2]);
      expect(filtered).toHaveLength(1);
      expect(filtered[0].id).toBe('item2');

      // Mark second item
      await dedup.markSeen([item2]);
      filtered = await dedup.filter([item1, item2]);
      expect(filtered).toHaveLength(0);
    });
  });

  describe('integration scenarios', () => {
    it('end-to-end: filter then mark workflow', async () => {
      const items: FeedItem[] = [
        {
          id: 'new1',
          source: 'https://example.com/feed.xml',
          title: 'New Article 1',
          url: 'https://example.com/new1',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
        {
          id: 'new2',
          source: 'https://example.com/feed.xml',
          title: 'New Article 2',
          url: 'https://example.com/new2',
          publishedAt: new Date('2024-09-06T10:00:00Z'),
        },
      ];

      // Simulate feed fetch
      const newItems = await dedup.filter(items);
      expect(newItems).toHaveLength(2);

      // Mark as seen after display
      await dedup.markSeen(newItems);

      // Next fetch should return empty
      const nextFetch = await dedup.filter(items);
      expect(nextFetch).toHaveLength(0);
    });

    it('handles items with same IDs from different sources', async () => {
      // This shouldn't happen with proper ID generation, but test it anyway
      const items: FeedItem[] = [
        {
          id: 'same-hash-id',
          source: 'https://source1.com/feed.xml',
          title: 'Article from Source 1',
          url: 'https://source1.com/article',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
        {
          id: 'same-hash-id',
          source: 'https://source2.com/feed.xml',
          title: 'Article from Source 2',
          url: 'https://source2.com/article',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
      ];

      // Mark first as seen
      await dedup.markSeen([items[0]]);

      // Both should be filtered since they have same ID
      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('preserves item data integrity', async () => {
      // Uses an item carrying the optional `summary` and `content` fields; the
      // assertion below only checks that such an item can be marked and is
      // then filtered out.
      const item: FeedItem = {
        id: 'complete-item',
        source: 'https://example.com/feed.xml',
        title: 'Complete Article',
        url: 'https://example.com/complete',
        publishedAt: new Date('2024-09-06T09:30:00Z'),
        summary: 'A summary',
        content: 'Full content',
      };

      // Mark as seen
      await dedup.markSeen([item]);

      // Filter should exclude the item
      const filtered = await dedup.filter([item]);
      expect(filtered).toHaveLength(0);
    });

    it('handles large batches efficiently', async () => {
      // Create 100 items, one per consecutive day starting 2024-09-01 09:00 UTC.
      // Date.UTC rolls day overflow into the following months, so every item
      // gets a valid date. Formatting the day into an ISO string would yield
      // "2024-09-31" through "2024-09-100", which are not real dates.
      const items: FeedItem[] = Array.from({ length: 100 }, (_, i) => ({
        id: `batch-item-${i}`,
        source: 'https://example.com/feed.xml',
        title: `Article ${i}`,
        url: `https://example.com/${i}`,
        publishedAt: new Date(Date.UTC(2024, 8, i + 1, 9)),
      }));

      // All should be returned initially
      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(100);

      // Mark half as seen
      const seenItems = items.slice(0, 50);
      await dedup.markSeen(seenItems);

      // Only unseen items should be returned
      const filteredAgain = await dedup.filter(items);
      expect(filteredAgain).toHaveLength(50);
      expect(filteredAgain[0].id).toBe('batch-item-50');
    });
  });

  describe('edge cases', () => {
    it('handles items with special characters in IDs', async () => {
      const items: FeedItem[] = [
        {
          id: 'item-with-::special::chars',
          source: 'https://example.com/feed.xml',
          title: 'Special ID Article',
          url: 'https://example.com/special',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
      ];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('handles very long IDs', async () => {
      const longId = 'a'.repeat(64); // Max length per schema
      const items: FeedItem[] = [
        {
          id: longId,
          source: 'https://example.com/feed.xml',
          title: 'Long ID Article',
          url: 'https://example.com/long',
          publishedAt: new Date('2024-09-06T09:00:00Z'),
        },
      ];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('handles duplicate IDs in single filter call', async () => {
      const item: FeedItem = {
        id: 'duplicate-in-input',
        source: 'https://example.com/feed.xml',
        title: 'Article',
        url: 'https://example.com/article',
        publishedAt: new Date('2024-09-06T09:00:00Z'),
      };

      // Same item twice in input
      const items = [item, item];

      // Filter returns both since neither has been marked seen
      // (filter only removes items from seen_ids table, not deduplicates input)
      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(2);

      // Mark one as seen
      await dedup.markSeen([item]);

      // Now both are filtered since they share the same ID
      const filteredAgain = await dedup.filter(items);
      expect(filteredAgain).toHaveLength(0);
    });
  });
});