- Add storage module with SQLite persistence via better-sqlite3 - Add deduplication module for feed item dedup - Add infrastructure directory for deployment config - Add .env.example for environment variables - Update dependencies: kysely, better-sqlite3, pg
401 lines
12 KiB
TypeScript
401 lines
12 KiB
TypeScript
import { describe, it, expect, beforeEach, afterAll, afterEach } from 'vitest';
import BetterSqlite3 from 'better-sqlite3';
import { Kysely, SqliteDialect } from 'kysely';
import { DatabaseDedup } from './dedup.js';
import { migrate, reset } from '../../infrastructure/db/schema.js';
import type { Database } from '../../infrastructure/db/database.js';
import type { FeedItem } from '../../interfaces/feed.types.js';
|
|
|
|
describe('DatabaseDedup', () => {
|
|
// Per-test fixtures: the raw better-sqlite3 handle, the Kysely instance that
// wraps it, and the DatabaseDedup under test. All three are reassigned in
// beforeEach, so every test starts from an empty database.
let sqliteDb: BetterSqlite3.Database;
let db: Kysely<Database>;
let dedup: DatabaseDedup;

/**
 * Creates a brand-new in-memory SQLite database for each test, runs the
 * schema helpers against it, and builds the DatabaseDedup on top of it.
 */
beforeEach(async () => {
  sqliteDb = new BetterSqlite3(':memory:');
  // NOTE(review): SQLite documents that an in-memory database only supports
  // the MEMORY and OFF journal modes and ignores requests for any other mode,
  // so this pragma presumably has no effect here. Retained unchanged.
  sqliteDb.pragma('journal_mode = WAL');

  db = new Kysely<Database>({
    dialect: new SqliteDialect({
      database: sqliteDb,
    }),
  });

  // Reset and migrate
  await reset(db);
  await migrate(db);

  dedup = new DatabaseDedup(db);
});

/**
 * Destroys the Kysely instance after every test.
 *
 * A new database is opened in beforeEach, so teardown has to run per test as
 * well; tearing down only once at the end of the suite would release just the
 * last instance and leave every earlier one open.
 */
afterEach(async () => {
  await db.destroy();
});
|
|
|
|
describe('filter', () => {
  /**
   * Builds a feed item that belongs to the shared example feed.
   * `publishedAt` is passed as an ISO-8601 string and converted to a Date.
   */
  const makeItem = (id: string, title: string, url: string, publishedAt: string): FeedItem => ({
    id,
    source: 'https://example.com/feed.xml',
    title,
    url,
    publishedAt: new Date(publishedAt),
  });

  it('returns all items when nothing is marked seen', async () => {
    const first = makeItem('item1', 'Article 1', 'https://example.com/1', '2024-09-06T09:00:00Z');
    const second = makeItem('item2', 'Article 2', 'https://example.com/2', '2024-09-06T10:00:00Z');

    // Nothing has been marked yet, so both items are expected back in input order.
    const result = await dedup.filter([first, second]);

    expect(result.map((entry) => entry.id)).toEqual(['item1', 'item2']);
  });

  it('excludes items that have been marked seen', async () => {
    const alreadySeen = makeItem('item1', 'Article 1', 'https://example.com/1', '2024-09-06T09:00:00Z');
    const unseen = makeItem('item2', 'Article 2', 'https://example.com/2', '2024-09-06T10:00:00Z');

    // Only the first item is recorded as seen.
    await dedup.markSeen([alreadySeen]);

    const result = await dedup.filter([alreadySeen, unseen]);

    expect(result.map((entry) => entry.id)).toEqual(['item2']);
  });

  it('returns empty array when all items are seen', async () => {
    const everything = [
      makeItem('item1', 'Article 1', 'https://example.com/1', '2024-09-06T09:00:00Z'),
    ];

    await dedup.markSeen(everything);

    const result = await dedup.filter(everything);

    expect(result).toHaveLength(0);
  });

  it('returns empty array for empty input', async () => {
    const result = await dedup.filter([]);

    expect(result).toHaveLength(0);
  });

  it('handles partial matches correctly', async () => {
    const seenA = makeItem('seen-item', 'Seen Article', 'https://example.com/seen', '2024-09-06T09:00:00Z');
    const fresh = makeItem('new-item', 'New Article', 'https://example.com/new', '2024-09-06T10:00:00Z');
    const seenB = makeItem('another-seen', 'Another Seen', 'https://example.com/another', '2024-09-06T11:00:00Z');

    // Mark the items on either side of the unseen one.
    await dedup.markSeen([seenA, seenB]);

    const result = await dedup.filter([seenA, fresh, seenB]);

    expect(result.map((entry) => entry.id)).toEqual(['new-item']);
  });
});
|
|
|
|
describe('markSeen', () => {
  /**
   * Builds a feed item that belongs to the shared example feed.
   * `publishedAt` is passed as an ISO-8601 string and converted to a Date.
   */
  const makeItem = (id: string, title: string, url: string, publishedAt: string): FeedItem => ({
    id,
    source: 'https://example.com/feed.xml',
    title,
    url,
    publishedAt: new Date(publishedAt),
  });

  it('marks items as seen', async () => {
    const batch = [
      makeItem('item1', 'Article 1', 'https://example.com/1', '2024-09-06T09:00:00Z'),
    ];

    await dedup.markSeen(batch);

    // A marked item must no longer pass through filter.
    const remaining = await dedup.filter(batch);
    expect(remaining).toHaveLength(0);
  });

  it('marks multiple items at once', async () => {
    const batch = [
      makeItem('item1', 'Article 1', 'https://example.com/1', '2024-09-06T09:00:00Z'),
      makeItem('item2', 'Article 2', 'https://example.com/2', '2024-09-06T10:00:00Z'),
    ];

    await dedup.markSeen(batch);

    const remaining = await dedup.filter(batch);
    expect(remaining).toHaveLength(0);
  });

  it('handles empty array gracefully', async () => {
    // Passes as long as the call does not throw or reject.
    await dedup.markSeen([]);
  });

  it('is idempotent - marking same item twice does not error', async () => {
    const repeated = makeItem('duplicate-id', 'Article', 'https://example.com/article', '2024-09-06T09:00:00Z');

    await dedup.markSeen([repeated]);
    // The second call with the same ID must not throw.
    await dedup.markSeen([repeated]);

    const remaining = await dedup.filter([repeated]);
    expect(remaining).toHaveLength(0);
  });

  it('marks items incrementally', async () => {
    const earlier = makeItem('item1', 'Article 1', 'https://example.com/1', '2024-09-06T09:00:00Z');
    const later = makeItem('item2', 'Article 2', 'https://example.com/2', '2024-09-06T10:00:00Z');

    // Step 1: only the first item is marked, so the second still passes.
    await dedup.markSeen([earlier]);
    const afterFirstMark = await dedup.filter([earlier, later]);
    expect(afterFirstMark.map((entry) => entry.id)).toEqual(['item2']);

    // Step 2: marking the second item leaves nothing unseen.
    await dedup.markSeen([later]);
    const afterSecondMark = await dedup.filter([earlier, later]);
    expect(afterSecondMark).toHaveLength(0);
  });
});
|
|
|
|
describe('integration scenarios', () => {
  it('end-to-end: filter then mark workflow', async () => {
    const items: FeedItem[] = [
      {
        id: 'new1',
        source: 'https://example.com/feed.xml',
        title: 'New Article 1',
        url: 'https://example.com/new1',
        publishedAt: new Date('2024-09-06T09:00:00Z'),
      },
      {
        id: 'new2',
        source: 'https://example.com/feed.xml',
        title: 'New Article 2',
        url: 'https://example.com/new2',
        publishedAt: new Date('2024-09-06T10:00:00Z'),
      },
    ];

    // Simulate feed fetch
    const newItems = await dedup.filter(items);
    expect(newItems).toHaveLength(2);

    // Mark as seen after display
    await dedup.markSeen(newItems);

    // Next fetch should return empty
    const nextFetch = await dedup.filter(items);
    expect(nextFetch).toHaveLength(0);
  });

  it('handles items with same IDs from different sources', async () => {
    // This shouldn't happen with proper ID generation, but test it anyway
    const items: FeedItem[] = [
      {
        id: 'same-hash-id',
        source: 'https://source1.com/feed.xml',
        title: 'Article from Source 1',
        url: 'https://source1.com/article',
        publishedAt: new Date('2024-09-06T09:00:00Z'),
      },
      {
        id: 'same-hash-id',
        source: 'https://source2.com/feed.xml',
        title: 'Article from Source 2',
        url: 'https://source2.com/article',
        publishedAt: new Date('2024-09-06T09:00:00Z'),
      },
    ];

    // Mark first as seen
    await dedup.markSeen([items[0]]);

    // Both should be filtered since they have same ID
    const filtered = await dedup.filter(items);
    expect(filtered).toHaveLength(0);
  });

  it('preserves item data integrity', async () => {
    // Item carrying the optional summary/content fields in addition to the
    // fields used by the other tests.
    const item: FeedItem = {
      id: 'complete-item',
      source: 'https://example.com/feed.xml',
      title: 'Complete Article',
      url: 'https://example.com/complete',
      publishedAt: new Date('2024-09-06T09:30:00Z'),
      summary: 'A summary',
      content: 'Full content',
    };

    // Mark as seen
    await dedup.markSeen([item]);

    // Filter should exclude the item
    const filtered = await dedup.filter([item]);
    expect(filtered).toHaveLength(0);
  });

  it('handles large batches efficiently', async () => {
    // Create 100 items, one per day starting 2024-09-01T09:00:00Z.
    // Date.UTC normalises day values past the end of the month, so every item
    // gets a valid timestamp. Formatting the index into an ISO string instead
    // produces day fields such as "2024-09-100", which parse to Invalid Date.
    const items: FeedItem[] = Array.from({ length: 100 }, (_, i) => ({
      id: `batch-item-${i}`,
      source: 'https://example.com/feed.xml',
      title: `Article ${i}`,
      url: `https://example.com/${i}`,
      publishedAt: new Date(Date.UTC(2024, 8, 1 + i, 9, 0, 0)),
    }));

    // All should be returned initially
    const filtered = await dedup.filter(items);
    expect(filtered).toHaveLength(100);

    // Mark half as seen
    const seenItems = items.slice(0, 50);
    await dedup.markSeen(seenItems);

    // Only unseen items should be returned
    const filteredAgain = await dedup.filter(items);
    expect(filteredAgain).toHaveLength(50);
    expect(filteredAgain[0].id).toBe('batch-item-50');
  });
});
|
|
|
|
describe('edge cases', () => {
  /**
   * Builds a feed item that belongs to the shared example feed, published at
   * the fixed timestamp every edge-case test uses.
   */
  const makeItem = (id: string, title: string, url: string): FeedItem => ({
    id,
    source: 'https://example.com/feed.xml',
    title,
    url,
    publishedAt: new Date('2024-09-06T09:00:00Z'),
  });

  it('handles items with special characters in IDs', async () => {
    const batch = [
      makeItem('item-with-::special::chars', 'Special ID Article', 'https://example.com/special'),
    ];

    await dedup.markSeen(batch);

    const remaining = await dedup.filter(batch);
    expect(remaining).toHaveLength(0);
  });

  it('handles very long IDs', async () => {
    // 64 characters: the maximum ID length per the schema, according to the
    // original author's note (the schema itself is not visible from this file).
    const maxLengthId = 'a'.repeat(64);
    const batch = [makeItem(maxLengthId, 'Long ID Article', 'https://example.com/long')];

    await dedup.markSeen(batch);

    const remaining = await dedup.filter(batch);
    expect(remaining).toHaveLength(0);
  });

  it('handles duplicate IDs in single filter call', async () => {
    const repeated = makeItem('duplicate-in-input', 'Article', 'https://example.com/article');

    // The very same item appears twice in the input.
    const doubled = [repeated, repeated];

    // Before anything is marked, both copies come back: filter only drops
    // items already recorded as seen and does not deduplicate its own input.
    const beforeMark = await dedup.filter(doubled);
    expect(beforeMark).toHaveLength(2);

    // Marking the item once...
    await dedup.markSeen([repeated]);

    // ...removes both copies, because they share one ID.
    const afterMark = await dedup.filter(doubled);
    expect(afterMark).toHaveLength(0);
  });
});
|
|
});
|