Add CLI entry point, RSS content extraction, and image support
Features: - Add CLI with commands: start, add, remove, list, fetch, status, items - Auto-detect RSS format when adding feeds - Auto-run database migrations on startup - Extract full HTML content from RSS description field (NOS-style feeds) - Extract image URLs from RSS enclosure tags - Display images in terminal output with emoji - Include imageUrl in JSON formatter output Database: - Add image_url column to feed_items table - Update storage layer to persist imageUrl field Tests: - Add 10 CLI integration tests - Add 3 RSS parser tests for image/content extraction - Add 2 storage tests for imageUrl persistence Dependencies: - Add commander for CLI framework All 144 tests passing
This commit is contained in:
parent
78a2b27f6d
commit
c79eb6d76d
3
.gitignore
vendored
3
.gitignore
vendored
@ -6,3 +6,6 @@ dist/
|
||||
build/
|
||||
.env
|
||||
.env.local
|
||||
*.db
|
||||
*.db-journal
|
||||
data/
|
||||
|
||||
127
cli.test.ts
Normal file
127
cli.test.ts
Normal file
@ -0,0 +1,127 @@
|
||||
/**
|
||||
* CLI tests.
|
||||
* Tests the Pulse CLI commands.
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
||||
import { execSync } from 'child_process';
|
||||
import { existsSync, unlinkSync } from 'fs';
|
||||
import { join } from 'path';
|
||||
|
||||
/** Scratch SQLite file handed to the CLI through PULSE_SQLITE_PATH. */
const TEST_DB_PATH = join(process.cwd(), 'test-cli.db');

/** Shell command used to launch the CLI entry point. */
const CLI_CMD = 'npx tsx cli.ts';

/**
 * Runs the CLI synchronously in a child process and captures its stdout.
 *
 * The child inherits the current environment, with PULSE_DATABASE_TYPE and
 * PULSE_SQLITE_PATH overridden so it uses the scratch database file.
 *
 * @param args - Raw argument string appended to the CLI command line.
 * @returns The child's stdout. When the child fails but still wrote to
 *   stdout, that stdout is returned instead of throwing.
 * @throws The `execSync` error when the child fails without any stdout.
 */
function runCli(args: string): string {
  const childEnv = {
    ...process.env,
    PULSE_DATABASE_TYPE: 'sqlite',
    PULSE_SQLITE_PATH: TEST_DB_PATH,
  };
  const commandLine = `${CLI_CMD} ${args}`;

  try {
    return execSync(commandLine, { encoding: 'utf-8', env: childEnv, stdio: 'pipe' });
  } catch (failure) {
    // A failed run that produced stdout is reported through its output.
    if (failure instanceof Error && 'stdout' in failure && failure.stdout) {
      return failure.stdout as string;
    }
    throw failure;
  }
}
|
||||
|
||||
describe('CLI', () => {
  /** Deletes the scratch database file if one exists. */
  const removeTestDb = (): void => {
    if (existsSync(TEST_DB_PATH)) {
      unlinkSync(TEST_DB_PATH);
    }
  };

  // Each test starts without a database file and leaves none behind.
  beforeEach(removeTestDb);
  afterEach(removeTestDb);

  describe('help', () => {
    it('should display help', () => {
      const output = runCli('--help');
      expect(output).toContain('Usage: pulse');
      expect(output).toContain('Commands:');
      expect(output).toContain('start');
      expect(output).toContain('add');
      expect(output).toContain('list');
    });

    it('should display version', () => {
      const output = runCli('--version');
      expect(output.trim()).toBe('0.1.0');
    });
  });

  describe('list', () => {
    it('should show empty list when no feeds', () => {
      const output = runCli('list');
      expect(output).toContain('No feed sources configured');
    });
  });

  describe('add', () => {
    it('should add a feed source', () => {
      const output = runCli('add "https://example.com/feed.xml" --name "Test Feed" --format rss');
      expect(output).toContain('Added feed:');
      expect(output).toContain('https://example.com/feed.xml');
      expect(output).toContain('rss');
    });

    it('should reject invalid format', () => {
      expect(() => {
        runCli('add "https://example.com/feed.xml" --format invalid');
      }).toThrow();
    });
  });

  describe('remove', () => {
    it('should remove a feed source', () => {
      // Add a feed and read its generated ID back from the "ID: ..." output
      // line, so the test does not depend on how IDs are derived from URLs.
      const addOutput = runCli('add "https://example.com/feed.xml" --name "Test Feed" --format rss');
      const idMatch = /ID:\s*(\S+)/.exec(addOutput);
      expect(idMatch).not.toBeNull();

      // Then remove it
      const output = runCli(`remove ${idMatch?.[1]}`);
      expect(output).toContain('Removed feed:');
    });

    it('should error when removing non-existent feed', () => {
      expect(() => {
        runCli('remove non-existent-id');
      }).toThrow();
    });
  });

  describe('status', () => {
    it('should show status for no feeds', () => {
      const output = runCli('status');
      expect(output).toContain('No feed sources configured');
    });

    it('should show status for healthy feed', () => {
      // Add a feed
      runCli('add "https://example.com/feed.xml" --name "Test Feed" --format rss');

      const output = runCli('status');
      expect(output).toContain('Healthy');
      expect(output).toContain('Test Feed');
    });
  });

  describe('items', () => {
    it('should show no items when database is empty', () => {
      const output = runCli('items');
      expect(output).toContain('No items found');
    });
  });
});
|
||||
387
cli.ts
Executable file
387
cli.ts
Executable file
@ -0,0 +1,387 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Pulse CLI - RSS Feed Aggregator
|
||||
*
|
||||
* Commands:
|
||||
* pulse start Start the orchestrator (foreground mode)
|
||||
* pulse add <url> Add a new feed source
|
||||
* pulse remove <id> Remove a feed source
|
||||
* pulse list List all feed sources
|
||||
* pulse fetch One-shot fetch of all active feeds
|
||||
* pulse status Show feed health status
|
||||
* pulse items [limit] Show recent items (default: 20)
|
||||
*/
|
||||
|
||||
import { Command } from 'commander';
|
||||
import { getDatabase, migrate } from './infrastructure/db/index.js';
|
||||
import { SqlStorage } from './modules/storage/index.js';
|
||||
import { DatabaseDedup } from './modules/dedup/index.js';
|
||||
import { HttpFetcher } from './modules/fetcher/index.js';
|
||||
import { RssParser, AtomParser } from './modules/parser/index.js';
|
||||
import { FeedOrchestrator } from './orchestrator/index.js';
|
||||
import { Formatter } from './modules/formatter/formatter.js';
|
||||
import { request } from 'undici';
|
||||
import type { FeedSource } from './interfaces/feed.types.js';
|
||||
|
||||
// Root command object: carries the CLI name, description and version string.
const program = new Command()
  .name('pulse')
  .description('RSS feed aggregator')
  .version('0.1.0');
|
||||
|
||||
/**
 * Builds the set of collaborating modules used by the CLI commands.
 *
 * Obtains the database handle, runs migrations against it, and constructs the
 * storage, dedup and fetcher modules plus a parser that tries RSS first and
 * falls back to Atom.
 *
 * @returns The database handle together with storage, dedup, fetcher and parser.
 */
async function createContext() {
  const db = getDatabase();

  // Migrations run on every invocation, before any module is constructed.
  await migrate(db);

  // Parser facade: attempt RSS, and on any RSS failure retry the input as Atom.
  const parser = {
    async parse(rawXml: string, source: string) {
      const rss = new RssParser();
      const atom = new AtomParser();

      try {
        return await rss.parse(rawXml, source);
      } catch {
        return await atom.parse(rawXml, source);
      }
    },
    // The facade accepts every input; format selection happens inside parse().
    supports() {
      return true;
    },
  };

  return {
    db,
    storage: new SqlStorage(db),
    dedup: new DatabaseDedup(db),
    fetcher: new HttpFetcher(),
    parser,
  };
}
|
||||
|
||||
/**
 * Derives a feed ID from its URL.
 *
 * Computes a 32-bit hash (seed 5381, multiply-by-33 and add each UTF-16 code
 * unit) and renders it as unpadded lowercase hex. The accumulator is wrapped
 * to an unsigned 32-bit value after every step, so the result stays exact for
 * inputs of any length instead of relying on an ever-growing double.
 *
 * @param url - Feed URL to hash.
 * @returns Hex string of the unsigned 32-bit hash (1 to 8 characters).
 */
function generateFeedId(url: string): string {
  let hash = 5381;
  for (let i = 0; i < url.length; i++) {
    // Math.imul performs the multiplication in 32-bit integer arithmetic;
    // `>>> 0` folds the sum back into the unsigned 32-bit range.
    hash = (Math.imul(hash, 33) + url.charCodeAt(i)) >>> 0;
  }
  return hash.toString(16);
}
|
||||
|
||||
/**
 * Classifies a feed document by the name of its root element.
 *
 * Looking only at the root avoids false positives from text that merely
 * contains "<feed", such as `<feedburner:info>` elements inside an RSS feed.
 *
 * @param xml - Raw response body.
 * @returns 'atom' when the first element is `feed` (with or without a
 *   namespace prefix, compared case-insensitively), otherwise 'rss'.
 */
function sniffFeedFormat(xml: string): 'rss' | 'atom' {
  // Drop comments first so a commented-out tag cannot be taken for the root.
  const withoutComments = xml.replace(/<!--[\s\S]*?-->/g, '');
  // First tag that is not a declaration (<?...), a <!DOCTYPE ...> or a closing tag.
  const root = /<(?![?!\/])(?:[A-Za-z_][\w.-]*:)?([A-Za-z_][\w.-]*)/.exec(withoutComments);
  return root?.[1]?.toLowerCase() === 'feed' ? 'atom' : 'rss';
}

/**
 * Auto-detects whether the feed at `url` is RSS or Atom.
 *
 * Issues a GET request with a 10 second timeout. A content-type header that
 * mentions "atom" wins; otherwise the body's root element decides.
 *
 * @param url - Feed URL to probe.
 * @returns The detected format; 'rss' when the request or detection fails.
 */
async function detectFeedFormat(url: string): Promise<'rss' | 'atom'> {
  try {
    const { headers, body } = await request(url, {
      method: 'GET',
      headers: { 'User-Agent': 'Pulse-RSS-Fetcher/1.0' },
      signal: AbortSignal.timeout(10000),
    });

    const responseBody = await body.text();

    // The header may arrive as a single string or as a list of values.
    const rawContentType = headers['content-type'];
    const contentType = (Array.isArray(rawContentType) ? rawContentType.join(',') : rawContentType) ?? '';

    if (contentType.toLowerCase().includes('atom')) {
      return 'atom';
    }

    return sniffFeedFormat(responseBody);
  } catch {
    // Detection is best-effort: default to RSS if anything goes wrong.
    return 'rss';
  }
}
|
||||
|
||||
// Start command: runs the orchestrator in the foreground until SIGINT/SIGTERM.
program
  .command('start')
  .description('Start the orchestrator (runs in foreground)')
  .option('-c, --concurrency <number>', 'Number of concurrent fetches', '5')
  .action(async (options) => {
    try {
      // Reject unusable values up front instead of handing NaN to the orchestrator.
      const concurrency = Number.parseInt(options.concurrency, 10);
      if (!Number.isInteger(concurrency) || concurrency < 1) {
        console.error(`Invalid concurrency: ${options.concurrency}. Use a positive integer.`);
        process.exit(1);
      }

      const { db, storage, dedup, fetcher, parser } = await createContext();

      const orchestrator = new FeedOrchestrator({
        storage,
        fetcher,
        parser,
        dedup,
        concurrency,
      });

      console.log('Starting Pulse orchestrator...');

      // Graceful shutdown: stop the orchestrator, release the database, exit.
      // The flag makes a second signal during shutdown a no-op.
      let shuttingDown = false;
      const shutdown = async (): Promise<void> => {
        if (shuttingDown) {
          return;
        }
        shuttingDown = true;

        console.log('\nShutting down gracefully...');
        try {
          await orchestrator.stop();
          // Assumes getDatabase() returns a Kysely instance (migrate() is typed
          // to take one); Kysely releases its connections via destroy().
          await db.destroy();
          console.log('Shutdown complete.');
          process.exit(0);
        } catch (error) {
          console.error('Failed to shut down cleanly:', error instanceof Error ? error.message : error);
          process.exit(1);
        }
      };

      process.on('SIGINT', () => void shutdown());
      process.on('SIGTERM', () => void shutdown());

      await orchestrator.start();
      console.log('Orchestrator running. Press Ctrl+C to stop.');

      // Park this handler on a promise that never settles so the action does not return.
      await new Promise(() => {});
    } catch (error) {
      console.error('Failed to start orchestrator:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
|
||||
|
||||
// Add command: registers a new feed source, auto-detecting its format when asked.
program
  .command('add <url>')
  .description('Add a new feed source')
  .option('-n, --name <name>', 'Display name for the feed')
  .option('-f, --format <format>', 'Feed format (rss or atom)', 'auto')
  .option('-i, --interval <ms>', 'Poll interval in milliseconds', '300000')
  .action(async (url, options) => {
    try {
      // Validate the options before opening the database or touching the
      // network, so bad input fails fast and is never persisted.
      if (options.format !== 'auto' && options.format !== 'rss' && options.format !== 'atom') {
        console.error(`Invalid format: ${options.format}. Use 'rss', 'atom', or 'auto'.`);
        process.exit(1);
      }

      const pollIntervalMs = Number.parseInt(options.interval, 10);
      if (!Number.isInteger(pollIntervalMs) || pollIntervalMs <= 0) {
        console.error(`Invalid interval: ${options.interval}. Use a positive number of milliseconds.`);
        process.exit(1);
      }

      const { storage } = await createContext();

      const id = generateFeedId(url);

      // The ID is derived from the URL, so an existing ID means a duplicate feed.
      const existing = await storage.getFeedSourceById(id);
      if (existing) {
        console.error(`Feed already exists: ${url}`);
        process.exit(1);
      }

      let format: 'rss' | 'atom';
      if (options.format === 'rss' || options.format === 'atom') {
        format = options.format;
      } else {
        // Only 'auto' reaches this branch; other values were rejected above.
        console.log('Detecting feed format...');
        format = await detectFeedFormat(url);
        console.log(`Detected format: ${format}`);
      }

      const now = new Date();
      const source: FeedSource = {
        id,
        url,
        name: options.name ?? null,
        format,
        pollIntervalMs,
        isActive: true,
        lastFetchedAt: null,
        lastSuccessAt: null,
        consecutiveFailures: 0,
        createdAt: now,
        updatedAt: now,
      };

      await storage.saveFeedSource(source);
      console.log(`Added feed: ${url}`);
      console.log(` ID: ${id}`);
      console.log(` Format: ${format}`);
      console.log(` Interval: ${pollIntervalMs}ms`);
    } catch (error) {
      console.error('Failed to add feed:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
|
||||
|
||||
// Remove command: deletes a feed source after confirming that it exists.
program
  .command('remove <id>')
  .description('Remove a feed source by ID')
  .action(async (id) => {
    try {
      const { storage } = await createContext();

      // Look the feed up first so a missing ID gets a clear error message.
      const feed = await storage.getFeedSourceById(id);
      if (!feed) {
        console.error(`Feed not found: ${id}`);
        process.exit(1);
      }

      await storage.deleteFeedSource(id);

      const label = feed.name ?? feed.url;
      console.log(`Removed feed: ${label} (${id})`);
    } catch (error) {
      const reason = error instanceof Error ? error.message : error;
      console.error('Failed to remove feed:', reason);
      process.exit(1);
    }
  });
|
||||
|
||||
// List command: prints every configured feed source with its key settings.
program
  .command('list')
  .description('List all feed sources')
  .action(async () => {
    try {
      const { storage } = await createContext();

      const sources = await storage.getFeedSources();

      if (sources.length === 0) {
        console.log('No feed sources configured.');
        return;
      }

      console.log(`\n ${sources.length} feed source(s):\n`);

      for (const source of sources) {
        const status = source.isActive ? '✓' : '✗';
        const health = source.consecutiveFailures > 0 ? ` (${source.consecutiveFailures} failures)` : '';

        console.log(` ${status} ${source.name ?? source.url}`);
        console.log(` ID: ${source.id}`);
        console.log(` URL: ${source.url}`);
        console.log(` Format: ${source.format}`);
        console.log(` Interval: ${source.pollIntervalMs}ms`);
        if (source.lastSuccessAt) {
          console.log(` Last success: ${source.lastSuccessAt.toLocaleString()}`);
        }
        // Print the failure line only when there are failures; an empty
        // summary used to emit a whitespace-only line for every healthy feed.
        if (health) {
          console.log(` ${health}`);
        }
        console.log();
      }
    } catch (error) {
      console.error('Failed to list feeds:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
|
||||
|
||||
// Fetch command: processes all feeds once and prints a per-feed report plus totals.
program
  .command('fetch')
  .description('One-shot fetch of all active feeds')
  .action(async () => {
    try {
      const { storage, dedup, fetcher, parser } = await createContext();
      const orchestrator = new FeedOrchestrator({ storage, fetcher, parser, dedup });

      console.log('Fetching all active feeds...\n');

      const results = await orchestrator.processAllFeeds();

      // Running totals across the feeds that were processed successfully.
      let foundCount = 0;
      let newCount = 0;

      for (const [sourceId, result] of results) {
        // Prefer a human-friendly label, falling back to the URL and then the raw ID.
        const feed = await storage.getFeedSourceById(sourceId);
        const label = feed?.name ?? feed?.url ?? sourceId;

        if (result.success) {
          console.log(`✓ ${label}`);
          console.log(` Found: ${result.itemsFound} items`);
          console.log(` New: ${result.itemsNew} items`);
          foundCount += result.itemsFound;
          newCount += result.itemsNew;
        } else {
          console.log(`✗ ${label}`);
          console.log(` Error: ${result.error?.reason ?? 'Unknown error'}`);
        }
        console.log();
      }

      console.log(`Total: ${foundCount} items found, ${newCount} new items`);
    } catch (error) {
      const reason = error instanceof Error ? error.message : error;
      console.error('Failed to fetch feeds:', reason);
      process.exit(1);
    }
  });
|
||||
|
||||
// Status command: groups feeds into healthy and unhealthy and prints both groups.
program
  .command('status')
  .description('Show feed health status')
  .action(async () => {
    try {
      const { storage, dedup, fetcher, parser } = await createContext();
      const orchestrator = new FeedOrchestrator({ storage, fetcher, parser, dedup });

      const reports = await orchestrator.getFeedHealth();

      if (reports.length === 0) {
        console.log('No feed sources configured.');
        return;
      }

      console.log(`\n Feed Health Status:\n`);

      const healthyFeeds = reports.filter((report) => report.isHealthy);
      const unhealthyFeeds = reports.filter((report) => !report.isHealthy);

      if (healthyFeeds.length > 0) {
        console.log(` Healthy (${healthyFeeds.length}):`);
        for (const feed of healthyFeeds) {
          console.log(` ✓ ${feed.name ?? feed.url}`);
        }
        console.log();
      }

      if (unhealthyFeeds.length > 0) {
        console.log(` Unhealthy (${unhealthyFeeds.length}):`);
        for (const feed of unhealthyFeeds) {
          console.log(` ✗ ${feed.name ?? feed.url}`);
          console.log(` Failures: ${feed.consecutiveFailures}`);
          // Show when the feed last worked, or that it never has.
          if (feed.lastSuccessAt) {
            console.log(` Last success: ${feed.lastSuccessAt.toLocaleString()}`);
          } else {
            console.log(` Never successfully fetched`);
          }
        }
        console.log();
      }
    } catch (error) {
      const reason = error instanceof Error ? error.message : error;
      console.error('Failed to get status:', reason);
      process.exit(1);
    }
  });
|
||||
|
||||
// Items command: prints the most recent stored items in terminal format.
program
  .command('items [limit]')
  .description('Show recent items (default: 20)')
  .action(async (limitStr) => {
    try {
      // Validate the optional limit before touching the database, so NaN or a
      // negative number never reaches the storage query.
      const limit = limitStr === undefined ? 20 : Number.parseInt(limitStr, 10);
      if (!Number.isInteger(limit) || limit < 0) {
        console.error(`Invalid limit: ${limitStr}. Use a non-negative integer.`);
        process.exit(1);
      }

      const { storage } = await createContext();
      const formatter = new Formatter();

      const items = await storage.getRecent(limit);

      if (items.length === 0) {
        console.log('No items found.');
        return;
      }

      const output = await formatter.format(items, 'terminal');
      console.log(output);
    } catch (error) {
      console.error('Failed to get items:', error instanceof Error ? error.message : error);
      process.exit(1);
    }
  });
|
||||
|
||||
// Every action handler above is async, so parse with parseAsync: it returns a
// promise that settles after the selected handler finishes, and lets a
// rejection be reported here instead of surfacing as an unhandled rejection.
program.parseAsync().catch((error: unknown) => {
  console.error(error instanceof Error ? error.message : error);
  process.exit(1);
});
|
||||
@ -11,6 +11,7 @@ export interface FeedItemTable {
|
||||
published_at: string; // ISO 8601 format
|
||||
content: string | null;
|
||||
summary: string | null;
|
||||
image_url: string | null;
|
||||
created_at: string; // ISO 8601 format
|
||||
}
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@ export async function migrate(db: Kysely<Database>): Promise<void> {
|
||||
.addColumn('published_at', 'varchar(32)', (col) => col.notNull())
|
||||
.addColumn('content', 'text')
|
||||
.addColumn('summary', 'text')
|
||||
.addColumn('image_url', 'varchar(2048)')
|
||||
.addColumn('created_at', 'varchar(32)', (col) => col.notNull().defaultTo('CURRENT_TIMESTAMP'))
|
||||
.execute();
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ export interface FeedItem {
|
||||
publishedAt: Date;
|
||||
content?: string;
|
||||
summary?: string;
|
||||
imageUrl?: string;
|
||||
}
|
||||
|
||||
export interface FetchInput {
|
||||
|
||||
@ -8,6 +8,7 @@ interface JsonFeedItem {
|
||||
publishedAt: string;
|
||||
content?: string;
|
||||
summary?: string;
|
||||
imageUrl?: string;
|
||||
}
|
||||
|
||||
export class JsonFormatter {
|
||||
@ -19,7 +20,8 @@ export class JsonFormatter {
|
||||
url: item.url,
|
||||
publishedAt: item.publishedAt.toISOString(),
|
||||
...(item.content !== undefined && { content: item.content }),
|
||||
...(item.summary !== undefined && { summary: item.summary })
|
||||
...(item.summary !== undefined && { summary: item.summary }),
|
||||
...(item.imageUrl !== undefined && { imageUrl: item.imageUrl })
|
||||
}));
|
||||
|
||||
return JSON.stringify(jsonItems, null, 2);
|
||||
|
||||
@ -23,6 +23,10 @@ export class TerminalFormatter {
|
||||
lines.push(` ${this.dim(truncated)}`);
|
||||
}
|
||||
|
||||
if (item.imageUrl) {
|
||||
lines.push(` ${this.dim('📷')} ${this.dim(this.truncate(item.imageUrl, 70))}`);
|
||||
}
|
||||
|
||||
lines.push('');
|
||||
});
|
||||
|
||||
|
||||
@ -169,6 +169,139 @@ describe('RssParser', () => {
|
||||
|
||||
expect(items1[0].id).toBe(items2[0].id);
|
||||
});
|
||||
|
||||
it('uses description as content when no content:encoded and description contains HTML', async () => {
|
||||
// Simulates feeds like NOS that put full HTML content in description
|
||||
const xml = `<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<item>
|
||||
<title><![CDATA[Iran ontkent aanvallen VAE]]></title>
|
||||
<link>https://nos.nl/l/2613264</link>
|
||||
<description><![CDATA[
|
||||
<p>Iran ontkent aanvallen te hebben uitgevoerd op de Verenigde Arabische Emiraten.</p>
|
||||
<p>Gisteren werden er ook al aanvallen gemeld door de VAE.</p>
|
||||
<h2>Onderhandelingen onmogelijk</h2>
|
||||
<p>Iraanse staatsmedia melden dat de Iraanse president Pezeshkian heeft gezegd dat de VS aan de ene kant de druk op Iran opvoert.</p>
|
||||
]]></description>
|
||||
<pubDate>Tue, 5 May 2026 21:44:46 +0200</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`;
|
||||
|
||||
const items = await parser.parse(xml, 'https://feeds.nos.nl/nosnieuwsalgemeen');
|
||||
|
||||
expect(items).toHaveLength(1);
|
||||
expect(items[0].title).toBe('Iran ontkent aanvallen VAE');
|
||||
// Content should contain the full HTML
|
||||
expect(items[0].content).toContain('<p>Iran ontkent aanvallen');
|
||||
expect(items[0].content).toContain('<h2>Onderhandelingen onmogelijk</h2>');
|
||||
// Summary should be extracted from content
|
||||
expect(items[0].summary).toBeDefined();
|
||||
expect(items[0].summary).toContain('Iran ontkent aanvallen');
|
||||
expect(items[0].summary?.length).toBeLessThanOrEqual(210); // 200 + "..."
|
||||
});
|
||||
|
||||
it('uses description as summary when it looks like plain text summary', async () => {
|
||||
const xml = `<?xml version="1.0"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<item>
|
||||
<title>Short Summary Article</title>
|
||||
<link>https://example.com/article</link>
|
||||
<description>This is just a brief summary without HTML tags</description>
|
||||
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`;
|
||||
|
||||
const items = await parser.parse(xml, 'https://example.com/feed.xml');
|
||||
|
||||
expect(items).toHaveLength(1);
|
||||
expect(items[0].summary).toBe('This is just a brief summary without HTML tags');
|
||||
expect(items[0].content).toBeUndefined();
|
||||
});
|
||||
|
||||
it('strips CDATA wrappers from description and content', async () => {
|
||||
const xml = `<?xml version="1.0"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<item>
|
||||
<title><![CDATA[CDATA Title]]></title>
|
||||
<link>https://example.com/article</link>
|
||||
<description><![CDATA[<p>This is a very long content with <strong>formatting</strong> and lots of text to ensure it exceeds the 500 character threshold for being considered full content. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</p>]]></description>
|
||||
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`;
|
||||
|
||||
const items = await parser.parse(xml, 'https://example.com/feed.xml');
|
||||
|
||||
expect(items[0].title).toBe('CDATA Title');
|
||||
expect(items[0].content).toContain('<p>This is a very long content');
|
||||
expect(items[0].content).toContain('<strong>formatting</strong>');
|
||||
});
|
||||
|
||||
it('extracts image URL from enclosure', async () => {
|
||||
const xml = `<?xml version="1.0"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<item>
|
||||
<title>Article with Image</title>
|
||||
<link>https://example.com/article</link>
|
||||
<description>Article summary</description>
|
||||
<enclosure url="https://example.com/image.jpg" type="image/jpeg" length="12345"/>
|
||||
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`;
|
||||
|
||||
const items = await parser.parse(xml, 'https://example.com/feed.xml');
|
||||
|
||||
expect(items).toHaveLength(1);
|
||||
expect(items[0].imageUrl).toBe('https://example.com/image.jpg');
|
||||
});
|
||||
|
||||
it('extracts first image from multiple enclosures', async () => {
|
||||
const xml = `<?xml version="1.0"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<item>
|
||||
<title>Article with Multiple Enclosures</title>
|
||||
<link>https://example.com/article</link>
|
||||
<description>Article summary</description>
|
||||
<enclosure url="https://example.com/audio.mp3" type="audio/mpeg" length="12345"/>
|
||||
<enclosure url="https://example.com/image.webp" type="image/webp" length="67890"/>
|
||||
<enclosure url="https://example.com/video.mp4" type="video/mp4" length="99999"/>
|
||||
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`;
|
||||
|
||||
const items = await parser.parse(xml, 'https://example.com/feed.xml');
|
||||
|
||||
expect(items).toHaveLength(1);
|
||||
expect(items[0].imageUrl).toBe('https://example.com/image.webp');
|
||||
});
|
||||
|
||||
it('handles items without enclosure', async () => {
|
||||
const xml = `<?xml version="1.0"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<item>
|
||||
<title>Article without Image</title>
|
||||
<link>https://example.com/article</link>
|
||||
<description>Article summary</description>
|
||||
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>`;
|
||||
|
||||
const items = await parser.parse(xml, 'https://example.com/feed.xml');
|
||||
|
||||
expect(items).toHaveLength(1);
|
||||
expect(items[0].imageUrl).toBeUndefined();
|
||||
});
|
||||
});
|
||||
|
||||
describe('supports', () => {
|
||||
|
||||
@ -9,6 +9,12 @@ interface RssChannel {
|
||||
description?: string;
|
||||
}
|
||||
|
||||
interface RssEnclosure {
|
||||
'@_url'?: string;
|
||||
'@_type'?: string;
|
||||
'@_length'?: string;
|
||||
}
|
||||
|
||||
interface RssItem {
|
||||
title?: string;
|
||||
link?: string;
|
||||
@ -16,6 +22,7 @@ interface RssItem {
|
||||
'content:encoded'?: string;
|
||||
pubDate?: string;
|
||||
guid?: string;
|
||||
enclosure?: RssEnclosure | RssEnclosure[];
|
||||
}
|
||||
|
||||
interface RssFeed {
|
||||
@ -84,17 +91,111 @@ export class RssParser implements IParser {
|
||||
const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
|
||||
const url = item.link;
|
||||
|
||||
// Handle content extraction
|
||||
// Some feeds use content:encoded for full content
|
||||
// Others put full content in description (like NOS)
|
||||
const hasContentEncoded = !!item['content:encoded'];
|
||||
const description = item.description ? this.cleanText(item.description) : undefined;
|
||||
const contentEncoded = item['content:encoded'] ? this.cleanText(item['content:encoded']) : undefined;
|
||||
|
||||
// If there's content:encoded, use it as content and description as summary
|
||||
// If no content:encoded but description looks like full content (contains HTML), use it as content
|
||||
// Otherwise description is just a summary
|
||||
const descriptionLooksLikeContent = description && this.looksLikeFullContent(description);
|
||||
|
||||
let content: string | undefined;
|
||||
let summary: string | undefined;
|
||||
|
||||
if (contentEncoded) {
|
||||
// Standard case: content:encoded has full content, description has summary
|
||||
content = contentEncoded;
|
||||
summary = description;
|
||||
} else if (descriptionLooksLikeContent) {
|
||||
// No content:encoded but description contains full HTML content
|
||||
content = description;
|
||||
// Extract first paragraph or truncate for summary
|
||||
summary = this.extractSummary(description);
|
||||
} else {
|
||||
// Description is just a plain text summary
|
||||
summary = description;
|
||||
content = undefined;
|
||||
}
|
||||
|
||||
// Extract image URL from enclosure
|
||||
const imageUrl = this.extractImageUrl(item.enclosure);
|
||||
|
||||
return {
|
||||
id: generateId(url, publishedAt),
|
||||
source,
|
||||
title: this.cleanText(item.title),
|
||||
url,
|
||||
publishedAt,
|
||||
summary: item.description ? this.cleanText(item.description) : undefined,
|
||||
content: item['content:encoded'] ? this.cleanText(item['content:encoded']) : undefined,
|
||||
summary,
|
||||
content,
|
||||
imageUrl,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Picks an image URL from an item's enclosure(s).
 *
 * An enclosure whose type is explicitly `image/*` (compared
 * case-insensitively) is preferred wherever it appears in the list. If there
 * is none, the first enclosure without any type is used as a fallback.
 * Enclosures without a URL are ignored.
 *
 * @param enclosure - A single enclosure, a list of enclosures, or undefined.
 * @returns The chosen URL, or undefined when no suitable enclosure exists.
 */
private extractImageUrl(enclosure: RssEnclosure | RssEnclosure[] | undefined): string | undefined {
  if (!enclosure) {
    return undefined;
  }

  // Normalize the single-or-array shape and drop entries without a URL.
  const candidates = (Array.isArray(enclosure) ? enclosure : [enclosure]).filter(
    (enc) => Boolean(enc['@_url'])
  );

  const explicitImage = candidates.find((enc) =>
    (enc['@_type'] ?? '').trim().toLowerCase().startsWith('image/')
  );

  // Fall back to an untyped enclosure only when nothing is declared as an image.
  const chosen = explicitImage ?? candidates.find((enc) => !enc['@_type']);
  return chosen?.['@_url'];
}
|
||||
|
||||
/**
 * Heuristic: does this text look like full HTML content rather than a brief
 * plain-text summary?
 *
 * @param text - Cleaned description text.
 * @returns True when the text contains a common HTML tag AND is either longer
 *   than 500 characters or has multiple paragraphs/line breaks.
 */
private looksLikeFullContent(text: string): boolean {
  if (!text) return false;

  // Common content tags. The trailing class accepts whitespace, '>' or '/',
  // so self-closing forms without a space (<br/>, <img/>) are recognized too.
  const hasHtmlTags = /<(p|div|br|h[1-6]|ul|ol|li|img|a|strong|em|blockquote)[\s\/>]/i.test(text);

  // More than 500 characters suggests full content.
  const isLong = text.length > 500;

  // At least two <p> tags, or at least three segments when split on blank
  // lines or <br> tags.
  const paragraphTagCount = (text.match(/<p[\s>]/gi) ?? []).length;
  const hasMultipleParagraphs =
    paragraphTagCount >= 2 || text.split(/\n\n|<br\s*\/?>/i).length >= 3;

  return hasHtmlTags && (isLong || hasMultipleParagraphs);
}
|
||||
|
||||
/**
 * Produces a short plain-text summary from HTML content.
 *
 * Tags are replaced by spaces and whitespace is collapsed. Text of 200
 * characters or fewer is returned as-is; longer text is cut at the last space
 * within the first 200 characters and suffixed with "...".
 *
 * @param content - HTML content to summarize.
 * @returns The summary, or an empty string for empty input.
 */
private extractSummary(content: string): string {
  if (!content) return '';

  // Strip tags, then collapse the resulting runs of whitespace.
  const textOnly = content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();

  if (textOnly.length <= 200) {
    return textOnly;
  }

  const truncated = textOnly.substring(0, 200);
  const lastSpace = truncated.lastIndexOf(' ');

  // With no space in the window (for example text written without spaces),
  // lastIndexOf returns -1; keep the hard 200-character cut instead of
  // collapsing the summary to a bare "...".
  const head = lastSpace > 0 ? truncated.substring(0, lastSpace) : truncated;
  return head + '...';
}
|
||||
|
||||
private cleanText(text: string): string {
|
||||
if (!text) return '';
|
||||
// Remove CDATA wrappers if present
|
||||
|
||||
@ -136,6 +136,48 @@ describe('SqlStorage', () => {
|
||||
expect(recent[0].url).toBe('https://example.com/article'); // unchanged
|
||||
expect(recent[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z')); // unchanged
|
||||
});
|
||||
|
||||
it('saves and retrieves imageUrl', async () => {
|
||||
const item: FeedItem = {
|
||||
id: 'image-test',
|
||||
source: 'https://example.com/feed.xml',
|
||||
title: 'Article with Image',
|
||||
url: 'https://example.com/article',
|
||||
publishedAt: new Date('2024-09-06T09:00:00Z'),
|
||||
summary: 'Summary',
|
||||
imageUrl: 'https://example.com/image.jpg',
|
||||
};
|
||||
|
||||
await storage.save([item]);
|
||||
|
||||
const recent = await storage.getRecent(1);
|
||||
expect(recent).toHaveLength(1);
|
||||
expect(recent[0].imageUrl).toBe('https://example.com/image.jpg');
|
||||
});
|
||||
|
||||
it('updates imageUrl on upsert', async () => {
|
||||
const item: FeedItem = {
|
||||
id: 'image-update-test',
|
||||
source: 'https://example.com/feed.xml',
|
||||
title: 'Article',
|
||||
url: 'https://example.com/article',
|
||||
publishedAt: new Date('2024-09-06T09:00:00Z'),
|
||||
imageUrl: 'https://example.com/old-image.jpg',
|
||||
};
|
||||
|
||||
await storage.save([item]);
|
||||
|
||||
// Update with new image URL
|
||||
const updatedItem: FeedItem = {
|
||||
...item,
|
||||
imageUrl: 'https://example.com/new-image.jpg',
|
||||
};
|
||||
|
||||
await storage.save([updatedItem]);
|
||||
|
||||
const recent = await storage.getRecent(1);
|
||||
expect(recent[0].imageUrl).toBe('https://example.com/new-image.jpg');
|
||||
});
|
||||
});
|
||||
|
||||
describe('getRecent', () => {
|
||||
|
||||
@ -28,6 +28,7 @@ export class SqlStorage implements IStorage {
|
||||
published_at: item.publishedAt.toISOString(),
|
||||
content: item.content ?? null,
|
||||
summary: item.summary ?? null,
|
||||
image_url: item.imageUrl ?? null,
|
||||
created_at: new Date().toISOString(),
|
||||
}));
|
||||
|
||||
@ -40,6 +41,7 @@ export class SqlStorage implements IStorage {
|
||||
title: (eb) => eb.ref('excluded.title'),
|
||||
content: (eb) => eb.ref('excluded.content'),
|
||||
summary: (eb) => eb.ref('excluded.summary'),
|
||||
image_url: (eb) => eb.ref('excluded.image_url'),
|
||||
})
|
||||
)
|
||||
.execute();
|
||||
@ -96,6 +98,7 @@ export class SqlStorage implements IStorage {
|
||||
publishedAt: new Date(row.published_at),
|
||||
content: row.content ?? undefined,
|
||||
summary: row.summary ?? undefined,
|
||||
imageUrl: row.image_url ?? undefined,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
10
package-lock.json
generated
10
package-lock.json
generated
@ -9,6 +9,7 @@
|
||||
"version": "0.1.0",
|
||||
"dependencies": {
|
||||
"better-sqlite3": "^12.9.0",
|
||||
"commander": "^14.0.3",
|
||||
"fast-xml-parser": "^5.7.3",
|
||||
"kysely": "^0.28.17",
|
||||
"pg": "^8.20.0",
|
||||
@ -1117,6 +1118,15 @@
|
||||
"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==",
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/commander": {
|
||||
"version": "14.0.3",
|
||||
"resolved": "https://registry.npmjs.org/commander/-/commander-14.0.3.tgz",
|
||||
"integrity": "sha512-H+y0Jo/T1RZ9qPP4Eh1pkcQcLRglraJaSLoyOtHxu6AapkjWVCy2Sit1QQ4x3Dng8qDlSsZEet7g5Pq06MvTgw==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
}
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||
|
||||
@ -3,9 +3,14 @@
|
||||
"version": "0.1.0",
|
||||
"description": "RSS feed aggregator",
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"pulse": "./cli.ts"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "vitest run",
|
||||
"typecheck": "tsc --noEmit"
|
||||
"typecheck": "tsc --noEmit",
|
||||
"start": "tsx cli.ts start",
|
||||
"cli": "tsx cli.ts"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/better-sqlite3": "^7.6.13",
|
||||
@ -17,6 +22,7 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"better-sqlite3": "^12.9.0",
|
||||
"commander": "^14.0.3",
|
||||
"fast-xml-parser": "^5.7.3",
|
||||
"kysely": "^0.28.17",
|
||||
"pg": "^8.20.0",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user