pulse/modules/dedup/dedup.ts
Edo Limburg 40ccbbad1a Add storage, dedup modules and infrastructure configuration
- Add storage module with SQLite persistence via better-sqlite3
- Add deduplication module for feed item dedup
- Add infrastructure directory for deployment config
- Add .env.example for environment variables
- Update dependencies: kysely, better-sqlite3, pg
2026-05-05 21:59:50 +02:00

56 lines
1.4 KiB
TypeScript

/**
* Deduplication module implementation.
* Tracks seen item IDs to filter duplicates from feeds.
*/
import type { Kysely } from 'kysely';
import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IDedup } from '../../interfaces/dedup.interface.js';
import type { Database, SeenIdTable } from '../../infrastructure/db/database.js';
/**
 * Deduplicator backed by the `seen_ids` table.
 *
 * `filter` reports which feed items have not been recorded yet and
 * `markSeen` records items so later `filter` calls drop them.
 */
export class DatabaseDedup implements IDedup {
  /**
   * Upper bound on IDs (for lookups) or rows (for inserts) per statement.
   *
   * Each inserted row binds two parameters (`id`, `seen_at`), so 400 rows
   * stay below the 999 bound-parameter default of older SQLite builds.
   */
  private static readonly CHUNK_SIZE = 400;

  private readonly db: Kysely<Database>;

  /**
   * @param db Kysely instance used for all reads and writes of `seen_ids`.
   */
  constructor(db: Kysely<Database>) {
    this.db = db;
  }

  /**
   * Returns the items whose IDs are not yet present in `seen_ids`.
   *
   * Input order is preserved. When several items in the same batch share an
   * ID, only the first occurrence is returned, so a single batch cannot
   * smuggle duplicates past the filter.
   *
   * @param items Candidate feed items; not mutated.
   * @returns A new array of unseen items (empty when `items` is empty).
   */
  async filter(items: FeedItem[]): Promise<FeedItem[]> {
    if (items.length === 0) {
      return [];
    }

    // Query each distinct ID once, in bounded chunks.
    const uniqueIds = [...new Set(items.map((item) => item.id))];
    const seenIds = new Set<FeedItem['id']>();
    for (const idChunk of DatabaseDedup.chunk(uniqueIds)) {
      const seenRows = await this.db
        .selectFrom('seen_ids')
        .select('id')
        .where('id', 'in', idChunk)
        .execute();
      for (const row of seenRows) {
        seenIds.add(row.id);
      }
    }

    // Keep items that are neither stored as seen nor already emitted
    // earlier in this same batch.
    const emittedIds = new Set<FeedItem['id']>();
    return items.filter((item) => {
      if (seenIds.has(item.id) || emittedIds.has(item.id)) {
        return false;
      }
      emittedIds.add(item.id);
      return true;
    });
  }

  /**
   * Records the IDs of the given items in `seen_ids`.
   *
   * IDs that already exist are left untouched (`ON CONFLICT DO NOTHING`),
   * so calling this repeatedly with the same items is safe. All rows written
   * by one call share the same `seen_at` timestamp.
   *
   * @param items Items to record; an empty array is a no-op.
   */
  async markSeen(items: FeedItem[]): Promise<void> {
    if (items.length === 0) {
      return;
    }

    // One timestamp per call instead of one per item.
    const seenAt = new Date().toISOString();
    const uniqueIds = [...new Set(items.map((item) => item.id))];

    for (const idChunk of DatabaseDedup.chunk(uniqueIds)) {
      const rows: SeenIdTable[] = idChunk.map((id) => ({
        id,
        seen_at: seenAt,
      }));
      // Insert or ignore (idempotent)
      await this.db
        .insertInto('seen_ids')
        .values(rows)
        .onConflict((oc) => oc.column('id').doNothing())
        .execute();
    }
  }

  /** Splits `values` into consecutive slices of at most `CHUNK_SIZE` elements. */
  private static chunk<T>(values: readonly T[]): T[][] {
    const chunks: T[][] = [];
    for (let start = 0; start < values.length; start += DatabaseDedup.CHUNK_SIZE) {
      chunks.push(values.slice(start, start + DatabaseDedup.CHUNK_SIZE));
    }
    return chunks;
  }
}