pulse/modules/parser/rss.parser.ts
Edo Limburg c79eb6d76d Add CLI entry point, RSS content extraction, and image support
Features:
- Add CLI with commands: start, add, remove, list, fetch, status, items
- Auto-detect RSS format when adding feeds
- Auto-run database migrations on startup
- Extract full HTML content from RSS description field (NOS-style feeds)
- Extract image URLs from RSS enclosure tags
- Display images in terminal output with emoji
- Include imageUrl in JSON formatter output

Database:
- Add image_url column to feed_items table
- Update storage layer to persist imageUrl field

Tests:
- Add 10 CLI integration tests
- Add 3 RSS parser tests for image/content extraction
- Add 2 storage tests for imageUrl persistence

Dependencies:
- Add commander for CLI framework

All 144 tests passing
2026-05-05 23:05:30 +02:00

212 lines
6.0 KiB
TypeScript

import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IParser } from '../../interfaces/parser.interface.js';
import { XMLParser } from 'fast-xml-parser';
import { generateId, parseDate, isValidXml } from './utils.js';
/**
 * Channel-level metadata of an RSS feed: its title, link and description.
 * Every field is optional. This type is not referenced by the other
 * declarations in the visible part of this file.
 */
interface RssChannel {
title?: string;
link?: string;
description?: string;
}
/**
 * Attributes of an `<enclosure>` element as produced by the XML parser.
 *
 * The `@_` key prefix matches the `attributeNamePrefix` that `RssParser`
 * configures, and values stay strings because `parseAttributeValue` is
 * disabled there. Only `@_url` and `@_type` are read by the visible code.
 */
interface RssEnclosure {
'@_url'?: string;
'@_type'?: string;
'@_length'?: string;
}
/**
 * Shape of a single `<item>` element after XML parsing.
 *
 * `title` and `link` are declared optional here, but `RssParser.parseItem`
 * throws when either is missing. `enclosure` is either one object or an
 * array of them; `extractImageUrl` normalizes both forms.
 * `guid` is declared but not read by the visible code.
 */
interface RssItem {
title?: string;
link?: string;
description?: string;
// Full article body, when the feed provides it separately from the description.
'content:encoded'?: string;
pubDate?: string;
guid?: string;
enclosure?: RssEnclosure | RssEnclosure[];
}
/**
 * Top-level shape that the parsed XML document is cast to.
 *
 * Every level is optional, so callers must check `rss` and `channel` before
 * use. `item` is either a single object or an array; `RssParser.parse`
 * normalizes it to an array.
 */
interface RssFeed {
rss?: {
channel?: {
item?: RssItem[] | RssItem;
};
};
}
/**
 * Parser for RSS 2.0 feeds.
 *
 * Converts an RSS XML document into `FeedItem` objects. For each `<item>` it
 * extracts the title, link and publication date, decides which field holds
 * the full content versus the summary, and picks an image URL from the
 * item's enclosures.
 */
export class RssParser implements IParser {
  /** Maximum number of characters kept when a summary is derived from content. */
  private static readonly SUMMARY_MAX_LENGTH = 200;

  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings, matching the attribute setting above.
      // With the library default, numeric-looking text (e.g. a title of
      // "2024") is converted to a number and breaks the string handling below.
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an RSS document into feed items.
   *
   * @param xml - Raw XML text of the feed.
   * @param source - Identifier of the feed, copied onto every returned item.
   * @returns One `FeedItem` per `<item>`; an empty array when the channel has none.
   * @throws Error when the input is not valid XML, cannot be parsed, lacks the
   *   `<rss>` or `<channel>` element, or contains an item without title or link.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
    }

    let parsed: RssFeed;
    try {
      parsed = this.xmlParser.parse(xml) as RssFeed;
    } catch (error) {
      // A thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.rss) {
      throw new Error('Invalid RSS: missing <rss> root element');
    }

    const channel = parsed.rss.channel;
    if (!channel) {
      throw new Error('Invalid RSS: missing <channel> element');
    }

    const items = channel.item;
    if (!items) {
      return [];
    }

    // A channel with exactly one <item> is parsed as an object, not an array.
    const itemArray = Array.isArray(items) ? items : [items];
    return itemArray.map((item) => this.parseItem(item, source));
  }

  /**
   * Converts one parsed `<item>` into a `FeedItem`.
   *
   * Content selection:
   * - `content:encoded` present: it becomes the content, description the summary.
   * - otherwise, description that looks like full HTML: it becomes the content
   *   and a plain-text summary is derived from it.
   * - otherwise: description is the summary and there is no content.
   *
   * @throws Error when the item has no title or no link.
   */
  private parseItem(item: RssItem, source: string): FeedItem {
    if (!item.title) {
      throw new Error('RSS item missing required field: title');
    }
    if (!item.link) {
      throw new Error('RSS item missing required field: link');
    }

    // Items without a pubDate are stamped with the time of parsing.
    const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
    const url = item.link;

    const description = item.description ? this.cleanText(item.description) : undefined;
    const contentEncoded = item['content:encoded']
      ? this.cleanText(item['content:encoded'])
      : undefined;

    let content: string | undefined;
    let summary: string | undefined;
    if (contentEncoded) {
      // Standard case: content:encoded has full content, description has summary.
      content = contentEncoded;
      summary = description;
    } else if (description !== undefined && this.looksLikeFullContent(description)) {
      // No content:encoded, but description carries the full HTML (NOS-style feeds).
      content = description;
      summary = this.extractSummary(description);
    } else {
      // Description is just a plain-text summary (or absent).
      summary = description;
      content = undefined;
    }

    const imageUrl = this.extractImageUrl(item.enclosure);

    return {
      id: generateId(url, publishedAt),
      source,
      title: this.cleanText(item.title),
      url,
      publishedAt,
      summary,
      content,
      imageUrl,
    };
  }

  /**
   * Returns the URL of the first enclosure that has a URL and is either
   * untyped or typed `image/*`; `undefined` when none qualifies.
   */
  private extractImageUrl(
    enclosure: RssEnclosure | RssEnclosure[] | undefined,
  ): string | undefined {
    if (!enclosure) {
      return undefined;
    }

    // Handle a single enclosure or an array of enclosures.
    const enclosures = Array.isArray(enclosure) ? enclosure : [enclosure];

    for (const enc of enclosures) {
      const url = enc['@_url'];
      const type = enc['@_type'];
      // An enclosure without a type attribute is accepted as an image.
      if (url && (!type || type.startsWith('image/'))) {
        return url;
      }
    }
    return undefined;
  }

  /**
   * Heuristic: does `text` look like full HTML content rather than a brief
   * summary? True when it contains a common HTML tag AND is either longer
   * than 500 characters or has multiple paragraphs / line breaks.
   */
  private looksLikeFullContent(text: string): boolean {
    if (!text) return false;

    const hasHtmlTags =
      /<(p|div|br|h[1-6]|ul|ol|li|img|a|strong|em|blockquote)[\s>]/i.test(text);
    const isLong = text.length > 500;
    // At least two <p> tags, or at least two separators (blank line or <br>).
    const hasMultipleParagraphs =
      (text.match(/<p[\s>]/gi) || []).length >= 2 ||
      text.split(/\n\n|<br\s*\/?>/i).length >= 3;

    return hasHtmlTags && (isLong || hasMultipleParagraphs);
  }

  /**
   * Derives a plain-text summary from HTML content: tags are stripped,
   * whitespace is collapsed, and the text is cut to at most 200 characters
   * followed by `...`.
   *
   * The cut lands on the last word boundary inside the limit. When there is
   * no space within the limit (one very long token), the text is cut hard at
   * the limit instead of collapsing to an empty summary.
   */
  private extractSummary(content: string): string {
    if (!content) return '';

    const textOnly = content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
    const limit = RssParser.SUMMARY_MAX_LENGTH;
    if (textOnly.length <= limit) {
      return textOnly;
    }

    const truncated = textOnly.substring(0, limit);
    const lastSpace = truncated.lastIndexOf(' ');
    // lastIndexOf returns -1 when no space exists; substring(0, -1) would be ''.
    const head = lastSpace > 0 ? truncated.substring(0, lastSpace) : truncated;
    return head + '...';
  }

  /**
   * Strips a leading `<![CDATA[` and trailing `]]>` wrapper, if present, and
   * trims surrounding whitespace. Returns `''` for empty input.
   */
  private cleanText(text: string): string {
    if (!text) return '';
    return text
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /**
   * Reports whether this parser handles the given content type: true when the
   * type contains `rss`, compared case-insensitively.
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('rss');
  }
}