pulse/modules/parser/rss.parser.ts
Edo Limburg 2ea1f5cd95 Add parser module with RSS and Atom feed parsers
- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
2026-05-05 21:15:20 +02:00

111 lines
2.6 KiB
TypeScript

import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IParser } from '../../interfaces/parser.interface.js';
import { XMLParser } from 'fast-xml-parser';
import { generateId, parseDate, isValidXml } from './utils.js';
/**
 * Optional string fields named after the metadata elements of an RSS
 * `<channel>`: `title`, `link` and `description`.
 *
 * NOTE(review): no other declaration in the visible code references this
 * interface (`RssFeed` declares its own inline channel shape) — confirm
 * whether it is still needed or should be reused there.
 */
interface RssChannel {
title?: string;
link?: string;
description?: string;
}
/**
 * Shape of a single `<item>` entry as read by `RssParser.parseItem`.
 *
 * Every field is optional at the type level; `parseItem` itself rejects items
 * without `title` or `link`.
 */
interface RssItem {
// Required by parseItem: a falsy title makes it throw.
title?: string;
// Required by parseItem: used as the item URL and as input to the id.
link?: string;
// Mapped to the FeedItem `summary` when present.
description?: string;
// Mapped to the FeedItem `content` when present.
'content:encoded'?: string;
// Passed to parseDate when present; otherwise the current time is used.
pubDate?: string;
// Declared but not read by the visible parser code.
guid?: string;
}
/**
 * Shape the XML parser output is cast to in `RssParser.parse`: an optional
 * `rss` root holding an optional `channel`.
 */
interface RssFeed {
rss?: {
channel?: {
// Either a single item object or an array of items; `RssParser.parse`
// normalises both forms to an array before mapping.
item?: RssItem[] | RssItem;
};
};
}
/**
 * Parser for RSS 2.0 feeds.
 *
 * Reads the `<item>` entries found under `<rss><channel>` and converts each
 * one into a `FeedItem`. RSS `<description>` is mapped to `summary` and
 * `<content:encoded>` to `content`.
 */
export class RssParser implements IParser {
  private readonly xmlParser: XMLParser;

  /**
   * Creates the underlying XML parser with attribute parsing enabled
   * (attributes are exposed with the `@_` prefix) and value trimming on.
   */
  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. With the library default (true), text
      // such as "2024", "0" or "true" is coerced to a number/boolean, which
      // breaks the string handling in parseItem/cleanText and makes a title
      // of "0" or "false" look like a missing field.
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an RSS document into feed items.
   *
   * @param xml - Raw RSS document text.
   * @param source - Source identifier copied onto every returned item.
   * @returns One `FeedItem` per `<item>`; an empty array when the channel
   *   has no items.
   * @throws Error when the input fails the `isValidXml` check, when the XML
   *   parser throws, when the `<rss>` root or `<channel>` is missing, or when
   *   an item lacks a required field (see `parseItem`).
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
    }

    let parsed: RssFeed;
    try {
      parsed = this.xmlParser.parse(xml) as RssFeed;
    } catch (error: unknown) {
      // Anything can be thrown in JavaScript; only read `.message` from real
      // Error instances so the report never ends in "undefined".
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.rss) {
      throw new Error('Invalid RSS: missing <rss> root element');
    }

    const channel = parsed.rss.channel;
    if (!channel) {
      throw new Error('Invalid RSS: missing <channel> element');
    }

    const items = channel.item;
    if (!items) {
      return [];
    }

    // A channel with exactly one <item> is parsed as a plain object rather
    // than a one-element array, so normalise before mapping.
    const itemArray = Array.isArray(items) ? items : [items];
    return itemArray.map((item) => this.parseItem(item, source));
  }

  /**
   * Converts one parsed `<item>` into a `FeedItem`.
   *
   * @param item - Parsed item object.
   * @param source - Source identifier to attach to the result.
   * @returns The feed item built from the RSS fields.
   * @throws Error when `title` or `link` is missing or empty.
   */
  private parseItem(item: RssItem, source: string): FeedItem {
    if (!item.title) {
      throw new Error('RSS item missing required field: title');
    }
    if (!item.link) {
      throw new Error('RSS item missing required field: link');
    }

    // NOTE(review): when <pubDate> is absent the current time is used, and
    // that value is also passed to generateId — so such items presumably get
    // a different id on every parse. Confirm against generateId and callers.
    const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
    const url = item.link;

    return {
      id: generateId(url, publishedAt),
      source,
      title: this.cleanText(item.title),
      url,
      publishedAt,
      summary: item.description ? this.cleanText(item.description) : undefined,
      content: item['content:encoded'] ? this.cleanText(item['content:encoded']) : undefined,
    };
  }

  /**
   * Strips a literal CDATA wrapper and surrounding whitespace from text.
   *
   * @param text - Text value taken from a parsed element.
   * @returns The cleaned text, or an empty string for empty input.
   */
  private cleanText(text: string): string {
    if (!text) return '';
    // Trim first so that the anchored patterns below still match when the
    // wrapper is surrounded by whitespace; trim again afterwards to drop
    // whitespace that was inside the wrapper.
    return text
      .trim()
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /**
   * Reports whether this parser handles the given content type.
   *
   * @param contentType - Content type string, e.g. `application/rss+xml`.
   * @returns `true` when the string contains "rss" (case-insensitive).
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('rss');
  }
}