import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IParser } from '../../interfaces/parser.interface.js';
import { XMLParser } from 'fast-xml-parser';
import { generateId, parseDate, isValidXml } from './utils.js';

interface RssChannel {
  title?: string;
  link?: string;
  description?: string;
}

/** Attributes of an `<enclosure>` element, keyed with the `@_` attribute prefix. */
interface RssEnclosure {
  '@_url'?: string;
  '@_type'?: string;
  '@_length'?: string;
}

interface RssItem {
  title?: string;
  link?: string;
  description?: string;
  'content:encoded'?: string;
  pubDate?: string;
  guid?: string;
  enclosure?: RssEnclosure | RssEnclosure[];
}

interface RssFeed {
  rss?: {
    channel?: {
      // A single <item> is parsed as an object, several as an array.
      item?: RssItem[] | RssItem;
    };
  };
}

/** Maximum number of characters kept when a summary is derived from full content. */
const SUMMARY_MAX_LENGTH = 200;

/** Descriptions longer than this (and containing HTML) are treated as full content. */
const FULL_CONTENT_MIN_LENGTH = 500;

/**
 * Parser for RSS 2.0 feeds.
 */
export class RssParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an RSS document into feed items.
   *
   * @param xml - Raw RSS document.
   * @param source - Source identifier copied onto every returned item.
   * @returns One feed item per `<item>`; an empty array when the channel has none.
   * @throws Error when `isValidXml` rejects the input, when the XML parser
   *   fails, when `<rss>` or `<channel>` is missing, or when any item lacks a
   *   title or link. Because the method is async, these surface as rejections.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
    }

    let parsed: RssFeed;
    try {
      parsed = this.xmlParser.parse(xml) as RssFeed;
    } catch (error) {
      // Narrow instead of asserting: a thrown value is not guaranteed to be an Error.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.rss) {
      throw new Error('Invalid RSS: missing <rss> root element');
    }

    const channel = parsed.rss.channel;
    if (!channel) {
      throw new Error('Invalid RSS: missing <channel> element');
    }

    const items = channel.item;
    if (!items) {
      return [];
    }

    const itemArray = Array.isArray(items) ? items : [items];
    return itemArray.map((item) => this.parseItem(item, source));
  }

  /**
   * Converts one RSS item into a feed item.
   *
   * Content selection:
   * - `content:encoded` present: it becomes `content`, the description becomes `summary`.
   * - otherwise, a description that looks like full HTML content becomes
   *   `content` and a plain-text excerpt of it becomes `summary`.
   * - otherwise the description is only a `summary` and `content` is undefined.
   *
   * @throws Error when the item has no title or no link.
   */
  private parseItem(item: RssItem, source: string): FeedItem {
    if (!item.title) {
      throw new Error('RSS item missing required field: title');
    }
    if (!item.link) {
      throw new Error('RSS item missing required field: link');
    }

    // Items without a pubDate are stamped with the time of parsing.
    const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
    const url = item.link;

    // Some feeds use content:encoded for full content; others put the full
    // content in description (like NOS).
    const description = item.description ? this.cleanText(item.description) : undefined;
    const contentEncoded = item['content:encoded']
      ? this.cleanText(item['content:encoded'])
      : undefined;

    let content: string | undefined;
    let summary: string | undefined;

    if (contentEncoded) {
      // Standard case: content:encoded has full content, description has summary.
      content = contentEncoded;
      summary = description;
    } else if (description && this.looksLikeFullContent(description)) {
      // No content:encoded but description contains full HTML content.
      content = description;
      summary = this.extractSummary(description);
    } else {
      // Description is just a plain text summary.
      summary = description;
      content = undefined;
    }

    const imageUrl = this.extractImageUrl(item.enclosure);

    return {
      id: generateId(url, publishedAt),
      source,
      title: this.cleanText(item.title),
      url,
      publishedAt,
      summary,
      content,
      imageUrl,
    };
  }

  /**
   * Returns the URL of the first enclosure that has a URL and either no
   * declared type or an `image/*` type; undefined when none qualifies.
   */
  private extractImageUrl(
    enclosure: RssEnclosure | RssEnclosure[] | undefined,
  ): string | undefined {
    if (!enclosure) {
      return undefined;
    }

    // Handle single enclosure or array of enclosures.
    const enclosures = Array.isArray(enclosure) ? enclosure : [enclosure];

    for (const enc of enclosures) {
      const url = enc['@_url'];
      const type = enc['@_type'];
      if (!url) {
        continue;
      }
      // MIME types are case-insensitive, so compare in lower case.
      if (!type || type.toLowerCase().startsWith('image/')) {
        return url;
      }
    }

    return undefined;
  }

  /**
   * Heuristic: does `text` look like full HTML content rather than a brief summary?
   *
   * True when the text contains a common HTML tag AND is either longer than
   * `FULL_CONTENT_MIN_LENGTH` characters or has multiple paragraphs (two or more
   * `<p>` tags, or at least three segments separated by blank lines / `<br>`).
   */
  private looksLikeFullContent(text: string): boolean {
    if (!text) return false;

    // Common HTML tags that indicate full content.
    const hasHtmlTags =
      /<(p|div|br|h[1-6]|ul|ol|li|img|a|strong|em|blockquote)[\s>]/i.test(text);

    const isLong = text.length > FULL_CONTENT_MIN_LENGTH;

    // Count opening <p> tags; separately count segments split on blank lines
    // or <br> tags. The split pattern must not contain an empty alternative,
    // otherwise it splits between every character.
    const paragraphTagCount = (text.match(/<p[\s>]/gi) || []).length;
    const segmentCount = text.split(/\n\n|<br\s*\/?>/i).length;
    const hasMultipleParagraphs = paragraphTagCount >= 2 || segmentCount >= 3;

    return hasHtmlTags && (isLong || hasMultipleParagraphs);
  }

  /**
   * Builds a plain-text summary from HTML content: tags are removed, whitespace
   * is collapsed, and the result is cut to at most `SUMMARY_MAX_LENGTH`
   * characters (at a word boundary when one exists) followed by `...`.
   */
  private extractSummary(content: string): string {
    if (!content) return '';

    // Remove HTML tags and collapse runs of whitespace.
    const textOnly = content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();

    if (textOnly.length <= SUMMARY_MAX_LENGTH) {
      return textOnly;
    }

    const truncated = textOnly.substring(0, SUMMARY_MAX_LENGTH);
    const lastSpace = truncated.lastIndexOf(' ');
    // With no space in the window lastIndexOf returns -1, and substring(0, -1)
    // would yield an empty string; keep the hard cut in that case.
    const excerpt = lastSpace > 0 ? truncated.substring(0, lastSpace) : truncated;
    return excerpt + '...';
  }

  /**
   * Trims the text and removes a literal `<![CDATA[` prefix and `]]>` suffix
   * if present. Returns an empty string for empty input.
   */
  private cleanText(text: string): string {
    if (!text) return '';

    return text
      .trim()
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /** Returns true when the content type contains `rss` (case-insensitive). */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('rss');
  }
}