Features:
- Add CLI with commands: start, add, remove, list, fetch, status, items
- Auto-detect RSS format when adding feeds
- Auto-run database migrations on startup
- Extract full HTML content from RSS description field (NOS-style feeds)
- Extract image URLs from RSS enclosure tags
- Display images in terminal output with emoji
- Include imageUrl in JSON formatter output

Database:
- Add image_url column to feed_items table
- Update storage layer to persist imageUrl field

Tests:
- Add 10 CLI integration tests
- Add 3 RSS parser tests for image/content extraction
- Add 2 storage tests for imageUrl persistence

Dependencies:
- Add commander for CLI framework

All 144 tests passing
212 lines
6.0 KiB
TypeScript
212 lines
6.0 KiB
TypeScript
import type { FeedItem } from '../../interfaces/feed.types.js';
|
|
import type { IParser } from '../../interfaces/parser.interface.js';
|
|
import { XMLParser } from 'fast-xml-parser';
|
|
import { generateId, parseDate, isValidXml } from './utils.js';
|
|
|
|
/**
 * Channel-level metadata of an RSS 2.0 feed (the text children of
 * `<channel>`). Every field is optional because a feed may omit it.
 */
interface RssChannel {
  /** Text of the channel's `<title>` element. */
  title?: string;
  /** Text of the channel's `<link>` element. */
  link?: string;
  /** Text of the channel's `<description>` element. */
  description?: string;
}
|
|
|
|
/**
 * Parsed form of an RSS `<enclosure>` element.
 *
 * The keys carry the `@_` prefix because the parser in this file is
 * configured with `attributeNamePrefix: '@_'`; values stay strings because
 * it is configured with `parseAttributeValue: false`.
 */
interface RssEnclosure {
  /** The `url` attribute: location of the attached media. */
  '@_url'?: string;
  /** The `type` attribute: MIME type of the attached media. */
  '@_type'?: string;
  /** The `length` attribute, kept as the raw attribute string. */
  '@_length'?: string;
}
|
|
|
|
/**
 * Parsed form of a single RSS `<item>` element.
 *
 * All fields are optional at the type level; required ones are validated
 * at runtime when the item is converted to a feed item.
 */
interface RssItem {
  /** Text of the item's `<title>` element. */
  title?: string;
  /** Text of the item's `<link>` element. */
  link?: string;
  /** Text of `<description>`; may be a short summary or full HTML. */
  description?: string;
  /** Text of the namespaced `<content:encoded>` element, when present. */
  'content:encoded'?: string;
  /** Raw publication date string from `<pubDate>`. */
  pubDate?: string;
  /**
   * Item identifier from `<guid>`. Because attributes are not ignored by
   * the parser, a `<guid isPermaLink="...">` element is delivered as an
   * object holding the text node and the attribute rather than a string.
   */
  guid?: string | { '#text'?: string; '@_isPermaLink'?: string };
  /** One `<enclosure>` element, or an array when the item has several. */
  enclosure?: RssEnclosure | RssEnclosure[];
}
|
|
|
|
/**
 * Shape of the object produced by parsing an RSS 2.0 document:
 * `<rss>` → `<channel>` → `<item>`.
 */
interface RssFeed {
  rss?: {
    channel?: {
      /** An array when the channel has several items, a bare object when it has one. */
      item?: RssItem[] | RssItem;
    };
  };
}
|
|
|
|
/**
 * Parser for RSS 2.0 feeds.
 *
 * Converts an RSS XML document into `FeedItem` objects. Besides the basic
 * title/link/date mapping it:
 * - uses `content:encoded` as the full content when present,
 * - otherwise promotes an HTML-looking `description` to full content and
 *   derives a short plain-text summary from it,
 * - picks an image URL from the item's `<enclosure>` element(s).
 */
export class RssParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      // Keep attributes (needed for <enclosure url="..." type="...">).
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. With the library default, a title
      // such as "2024" would be delivered as a number and break the string
      // handling below (and "0" would be treated as a missing title).
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an RSS document into feed items.
   *
   * @param xml - Raw RSS XML text.
   * @param source - Source identifier copied onto every returned item.
   * @returns One `FeedItem` per `<item>`; an empty array when the channel
   *   has no items.
   * @throws Error when the input is rejected by `isValidXml`, when the XML
   *   parser fails, when `<rss>` or `<channel>` is missing, or when any item
   *   lacks a title or link.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
    }

    let parsed: RssFeed;
    try {
      parsed = this.xmlParser.parse(xml) as RssFeed;
    } catch (error) {
      // The thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.rss) {
      throw new Error('Invalid RSS: missing <rss> root element');
    }

    const channel = parsed.rss.channel;
    if (!channel) {
      throw new Error('Invalid RSS: missing <channel> element');
    }

    const items = channel.item;
    if (!items) {
      return [];
    }

    // A channel with exactly one <item> is parsed as a bare object.
    const itemArray = Array.isArray(items) ? items : [items];
    return itemArray.map((item) => this.parseItem(item, source));
  }

  /**
   * Converts one parsed `<item>` into a `FeedItem`.
   *
   * @throws Error when the item has no title or no link.
   */
  private parseItem(item: RssItem, source: string): FeedItem {
    if (!item.title) {
      throw new Error('RSS item missing required field: title');
    }
    if (!item.link) {
      throw new Error('RSS item missing required field: link');
    }

    // Items without a pubDate are stamped with the current time.
    const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
    const url = item.link;

    const description = item.description ? this.cleanText(item.description) : undefined;
    const contentEncoded = item['content:encoded']
      ? this.cleanText(item['content:encoded'])
      : undefined;

    let content: string | undefined;
    let summary: string | undefined;

    if (contentEncoded) {
      // Standard case: content:encoded is the full content and the
      // description is the summary.
      content = contentEncoded;
      summary = description;
    } else if (description !== undefined && this.looksLikeFullContent(description)) {
      // Some feeds (e.g. NOS) put the full HTML article in <description>;
      // use it as content and derive a plain-text summary from it.
      content = description;
      summary = this.extractSummary(description);
    } else {
      // Description is just a plain summary; there is no full content.
      summary = description;
      content = undefined;
    }

    const imageUrl = this.extractImageUrl(item.enclosure);

    return {
      id: generateId(url, publishedAt),
      source,
      title: this.cleanText(item.title),
      url,
      publishedAt,
      summary,
      content,
      imageUrl,
    };
  }

  /**
   * Returns the URL of the first enclosure that is an image.
   *
   * An enclosure qualifies when it has a URL and either declares an
   * `image/*` MIME type (compared case-insensitively) or declares no type
   * at all.
   */
  private extractImageUrl(enclosure: RssEnclosure | RssEnclosure[] | undefined): string | undefined {
    if (!enclosure) {
      return undefined;
    }

    const enclosures = Array.isArray(enclosure) ? enclosure : [enclosure];

    for (const candidate of enclosures) {
      const url = candidate['@_url'];
      if (!url) {
        continue;
      }
      const type = candidate['@_type'];
      // MIME types are case-insensitive, so normalise before comparing.
      if (!type || type.trim().toLowerCase().startsWith('image/')) {
        return url;
      }
    }

    return undefined;
  }

  /**
   * Heuristic: does `text` look like a full HTML article rather than a
   * brief summary? True when it contains a common HTML tag AND is either
   * longer than 500 characters or has multiple paragraphs/line breaks.
   */
  private looksLikeFullContent(text: string): boolean {
    if (!text) return false;

    const hasHtmlTags = /<(p|div|br|h[1-6]|ul|ol|li|img|a|strong|em|blockquote)[\s>]/i.test(text);
    if (!hasHtmlTags) return false;

    const isLong = text.length > 500;

    // At least two <p> tags, or at least three segments when split on
    // blank lines / <br> tags.
    const paragraphTags = (text.match(/<p[\s>]/gi) || []).length;
    const hasMultipleParagraphs =
      paragraphTags >= 2 || text.split(/\n\n|<br\s*\/?>/i).length >= 3;

    return isLong || hasMultipleParagraphs;
  }

  /**
   * Builds a plain-text summary from HTML content: strips tags, collapses
   * whitespace, and truncates to at most `maxLength` characters followed by
   * `...`, cutting at a word boundary when one exists.
   *
   * @param content - HTML content to summarise.
   * @param maxLength - Maximum number of text characters kept before the
   *   ellipsis.
   */
  private extractSummary(content: string, maxLength = 200): string {
    if (!content) return '';

    const textOnly = content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();
    if (textOnly.length <= maxLength) {
      return textOnly;
    }

    const truncated = textOnly.substring(0, maxLength);
    const lastSpace = truncated.lastIndexOf(' ');
    // Without a space in the window (one very long token) lastIndexOf
    // yields -1; fall back to a hard cut instead of an empty summary.
    const head = lastSpace > 0 ? truncated.substring(0, lastSpace) : truncated;
    return `${head}...`;
  }

  /**
   * Trims `text` and removes a surrounding CDATA wrapper if present.
   */
  private cleanText(text: string): string {
    if (!text) return '';
    // Trim first so the anchored patterns still match when the wrapper is
    // surrounded by whitespace.
    return text
      .trim()
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /**
   * Reports whether this parser handles the given content type: true when
   * it contains `rss`, case-insensitively.
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('rss');
  }
}
|