pulse/modules/dedup/dedup.test.ts
Edo Limburg 40ccbbad1a Add storage, dedup modules and infrastructure configuration
- Add storage module with SQLite persistence via better-sqlite3
- Add deduplication module for feed item dedup
- Add infrastructure directory for deployment config
- Add .env.example for environment variables
- Update dependencies: kysely, better-sqlite3, pg
2026-05-05 21:59:50 +02:00

401 lines
12 KiB
TypeScript

import { describe, it, expect, beforeEach, afterAll } from 'vitest';
import BetterSqlite3 from 'better-sqlite3';
import { Kysely, SqliteDialect } from 'kysely';
import { DatabaseDedup } from './dedup.js';
import { migrate, reset } from '../../infrastructure/db/schema.js';
import type { Database } from '../../infrastructure/db/database.js';
import type { FeedItem } from '../../interfaces/feed.types.js';
describe('DatabaseDedup', () => {
  /** Feed URL used by every fixture that does not need a distinct source. */
  const FEED_URL = 'https://example.com/feed.xml';

  // `db` stays undefined until a setup run has created a database, so the
  // teardown helper below must tolerate the "nothing to close" case.
  let db: Kysely<Database> | undefined;
  let dedup: DatabaseDedup;

  /** Returns a timestamp on 2024-09-06 (UTC) at the given hour and minute. */
  const at = (hour: number, minute = 0): Date =>
    new Date(Date.UTC(2024, 8, 6, hour, minute));

  /**
   * Builds a FeedItem fixture. `id`, `title` and `url` are always required;
   * `source` defaults to FEED_URL and `publishedAt` to 09:00 UTC unless
   * overridden.
   */
  const makeItem = (
    fields: Pick<FeedItem, 'id' | 'title' | 'url'> & Partial<FeedItem>,
  ): FeedItem => ({
    source: FEED_URL,
    publishedAt: at(9),
    ...fields,
  });

  /**
   * Builds the n-th stock article: id `item<n>`, title `Article <n>`,
   * url `https://example.com/<n>`, published at (08 + n):00 UTC.
   */
  const article = (n: number): FeedItem =>
    makeItem({
      id: `item${n}`,
      title: `Article ${n}`,
      url: `https://example.com/${n}`,
      publishedAt: at(8 + n),
    });

  /** Projects items to their ids so order and membership are asserted in one step. */
  const idsOf = (items: readonly FeedItem[]): string[] => items.map((item) => item.id);

  /** Destroys the current database handle, if any, and forgets it. */
  const closeDb = async (): Promise<void> => {
    const current = db;
    db = undefined;
    await current?.destroy();
  };

  beforeEach(async () => {
    // Every test gets its own database. Close the previous test's handle first;
    // otherwise only the final one would ever be destroyed (by afterAll).
    await closeDb();

    const sqliteDb = new BetterSqlite3(':memory:');
    // NOTE(review): SQLite does not apply WAL to in-memory databases, so this
    // pragma is effectively a no-op here — confirm whether it can be dropped.
    sqliteDb.pragma('journal_mode = WAL');

    const freshDb = new Kysely<Database>({
      dialect: new SqliteDialect({ database: sqliteDb }),
    });
    db = freshDb;

    // Reset and migrate so each test starts from the current schema.
    await reset(freshDb);
    await migrate(freshDb);
    dedup = new DatabaseDedup(freshDb);
  });

  afterAll(async () => {
    // Closes the database left open by the last test.
    await closeDb();
  });

  describe('filter', () => {
    it('returns all items when nothing is marked seen', async () => {
      const filtered = await dedup.filter([article(1), article(2)]);

      expect(idsOf(filtered)).toEqual(['item1', 'item2']);
    });

    it('excludes items that have been marked seen', async () => {
      const seen = article(1);
      const unseen = article(2);

      await dedup.markSeen([seen]);

      const filtered = await dedup.filter([seen, unseen]);
      expect(idsOf(filtered)).toEqual(['item2']);
    });

    it('returns empty array when all items are seen', async () => {
      const items = [article(1)];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('returns empty array for empty input', async () => {
      const filtered = await dedup.filter([]);

      expect(filtered).toHaveLength(0);
    });

    it('handles partial matches correctly', async () => {
      const seen = makeItem({
        id: 'seen-item',
        title: 'Seen Article',
        url: 'https://example.com/seen',
        publishedAt: at(9),
      });
      const fresh = makeItem({
        id: 'new-item',
        title: 'New Article',
        url: 'https://example.com/new',
        publishedAt: at(10),
      });
      const alsoSeen = makeItem({
        id: 'another-seen',
        title: 'Another Seen',
        url: 'https://example.com/another',
        publishedAt: at(11),
      });

      await dedup.markSeen([seen, alsoSeen]);

      const filtered = await dedup.filter([seen, fresh, alsoSeen]);
      expect(idsOf(filtered)).toEqual(['new-item']);
    });
  });

  describe('markSeen', () => {
    it('marks items as seen', async () => {
      const items = [article(1)];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('marks multiple items at once', async () => {
      const items = [article(1), article(2)];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('handles empty array gracefully', async () => {
      // A rejection from the awaited call fails the test; resolving is the pass condition.
      await dedup.markSeen([]);
    });

    it('is idempotent - marking same item twice does not error', async () => {
      const item = makeItem({
        id: 'duplicate-id',
        title: 'Article',
        url: 'https://example.com/article',
      });

      await dedup.markSeen([item]);
      await dedup.markSeen([item]); // Second call must not throw

      const filtered = await dedup.filter([item]);
      expect(filtered).toHaveLength(0);
    });

    it('marks items incrementally', async () => {
      const item1 = article(1);
      const item2 = article(2);

      // After marking only the first item, the second is still new.
      await dedup.markSeen([item1]);
      const afterFirst = await dedup.filter([item1, item2]);
      expect(idsOf(afterFirst)).toEqual(['item2']);

      // After marking the second item as well, nothing is new.
      await dedup.markSeen([item2]);
      const afterSecond = await dedup.filter([item1, item2]);
      expect(afterSecond).toHaveLength(0);
    });
  });

  describe('integration scenarios', () => {
    it('end-to-end: filter then mark workflow', async () => {
      const items = [
        makeItem({
          id: 'new1',
          title: 'New Article 1',
          url: 'https://example.com/new1',
          publishedAt: at(9),
        }),
        makeItem({
          id: 'new2',
          title: 'New Article 2',
          url: 'https://example.com/new2',
          publishedAt: at(10),
        }),
      ];

      // Simulate feed fetch
      const newItems = await dedup.filter(items);
      expect(newItems).toHaveLength(2);

      // Mark as seen after display
      await dedup.markSeen(newItems);

      // Next fetch should return empty
      const nextFetch = await dedup.filter(items);
      expect(nextFetch).toHaveLength(0);
    });

    it('handles items with same IDs from different sources', async () => {
      // This shouldn't happen with proper ID generation, but test it anyway
      const fromSource1 = makeItem({
        id: 'same-hash-id',
        source: 'https://source1.com/feed.xml',
        title: 'Article from Source 1',
        url: 'https://source1.com/article',
      });
      const fromSource2 = makeItem({
        id: 'same-hash-id',
        source: 'https://source2.com/feed.xml',
        title: 'Article from Source 2',
        url: 'https://source2.com/article',
      });

      // Mark first as seen
      await dedup.markSeen([fromSource1]);

      // Both should be filtered since they have same ID
      const filtered = await dedup.filter([fromSource1, fromSource2]);
      expect(filtered).toHaveLength(0);
    });

    it('preserves item data integrity', async () => {
      const item = makeItem({
        id: 'complete-item',
        title: 'Complete Article',
        url: 'https://example.com/complete',
        publishedAt: at(9, 30),
        summary: 'A summary',
        content: 'Full content',
      });

      await dedup.markSeen([item]);

      // An item carrying the optional fields is still excluded once seen.
      const filtered = await dedup.filter([item]);
      expect(filtered).toHaveLength(0);
    });

    it('handles large batches efficiently', async () => {
      // 100 items, one per day starting 2024-09-01. Date.UTC rolls day overflow
      // into the following months; formatting the day into a `2024-09-DD`
      // string would yield invalid calendar dates for most of the batch.
      const items: FeedItem[] = Array.from({ length: 100 }, (_, i) =>
        makeItem({
          id: `batch-item-${i}`,
          title: `Article ${i}`,
          url: `https://example.com/${i}`,
          publishedAt: new Date(Date.UTC(2024, 8, 1 + i, 9)),
        }),
      );

      // All should be returned initially
      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(100);

      // Mark the first half as seen
      await dedup.markSeen(items.slice(0, 50));

      // Only the unseen half should remain, starting at index 50
      const filteredAgain = await dedup.filter(items);
      expect(filteredAgain).toHaveLength(50);
      expect(idsOf(filteredAgain)[0]).toBe('batch-item-50');
    });
  });

  describe('edge cases', () => {
    it('handles items with special characters in IDs', async () => {
      const items = [
        makeItem({
          id: 'item-with-::special::chars',
          title: 'Special ID Article',
          url: 'https://example.com/special',
        }),
      ];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('handles very long IDs', async () => {
      const longId = 'a'.repeat(64); // Max length per schema
      const items = [
        makeItem({
          id: longId,
          title: 'Long ID Article',
          url: 'https://example.com/long',
        }),
      ];

      await dedup.markSeen(items);

      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(0);
    });

    it('handles duplicate IDs in single filter call', async () => {
      const item = makeItem({
        id: 'duplicate-in-input',
        title: 'Article',
        url: 'https://example.com/article',
      });
      // Same item twice in input
      const items = [item, item];

      // Filter returns both since neither has been marked seen
      // (filter only removes items already marked seen; it does not deduplicate its input)
      const filtered = await dedup.filter(items);
      expect(filtered).toHaveLength(2);

      // Mark one as seen
      await dedup.markSeen([item]);

      // Now both are filtered since they share the same ID
      const filteredAgain = await dedup.filter(items);
      expect(filteredAgain).toHaveLength(0);
    });
  });
});