- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
111 lines
2.6 KiB
TypeScript
111 lines
2.6 KiB
TypeScript
import type { FeedItem } from '../../interfaces/feed.types.js';
|
|
import type { IParser } from '../../interfaces/parser.interface.js';
|
|
import { XMLParser } from 'fast-xml-parser';
|
|
import { generateId, parseDate, isValidXml } from './utils.js';
|
|
|
|
/**
 * Channel-level fields of an RSS document (`<channel>` children).
 *
 * Every field is optional because the parsed XML is not validated against
 * the RSS schema before being typed.
 */
interface RssChannel {
  /** Text of the channel's `<title>` element. */
  title?: string;
  /** Text of the channel's `<link>` element. */
  link?: string;
  /** Text of the channel's `<description>` element. */
  description?: string;
}
|
|
|
|
/**
 * One `<item>` entry as produced by the XML parser.
 *
 * Every field is optional: the parser output is untrusted, and required
 * fields are checked when the item is converted to a `FeedItem`.
 */
interface RssItem {
  /** Text of `<title>`. */
  title?: string;
  /** Text of `<link>`. */
  link?: string;
  /** Text of `<description>`; mapped to the feed item's summary. */
  description?: string;
  /** Text of `<content:encoded>`; mapped to the feed item's content. */
  'content:encoded'?: string;
  /** Text of `<pubDate>`, still in its textual form. */
  pubDate?: string;
  /**
   * `<guid>` value. When the element carries an attribute (for example
   * `isPermaLink`) and attributes are not ignored, the parser yields an
   * object whose text lives under `#text` instead of a bare string.
   */
  guid?: string | { '#text'?: string; '@_isPermaLink'?: string };
}
|
|
|
|
/**
 * Shape of a parsed RSS document: `<rss>` → `<channel>` → `<item>`.
 *
 * Each level is optional so that missing elements can be detected and
 * reported individually.
 */
interface RssFeed {
  rss?: {
    channel?: {
      /** A single `<item>` is typed as an object, several as an array. */
      item?: RssItem[] | RssItem;
    };
  };
}
|
|
|
|
/**
 * Parser for RSS 2.0 feeds.
 *
 * Converts an RSS document into `FeedItem` records. `<description>` is mapped
 * to `summary` and `<content:encoded>` to `content`.
 */
export class RssParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. With the library default, numeric-looking
      // text such as "2024" or "007" is converted to a number, which loses
      // leading zeros and breaks the string handling below.
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an RSS document into feed items.
   *
   * @param xml - Raw RSS document text.
   * @param source - Identifier copied onto every returned item.
   * @returns One `FeedItem` per `<item>`; an empty array when the channel has none.
   * @throws Error when the input fails the XML pre-check, cannot be parsed,
   *   lacks `<rss>` or `<channel>`, or an item lacks a title or link.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
    }

    let parsed: RssFeed;
    try {
      parsed = this.xmlParser.parse(xml) as RssFeed;
    } catch (error) {
      // A thrown value is not guaranteed to be an Error instance.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.rss) {
      throw new Error('Invalid RSS: missing <rss> root element');
    }

    const channel = parsed.rss.channel;
    if (!channel) {
      throw new Error('Invalid RSS: missing <channel> element');
    }

    const items = channel.item;
    if (!items) {
      return [];
    }

    // A lone <item> arrives as an object rather than a one-element array.
    const itemArray = Array.isArray(items) ? items : [items];

    return itemArray.map((item) => this.parseItem(item, source));
  }

  /**
   * Converts one parsed `<item>` into a `FeedItem`.
   *
   * @throws Error when the item has no usable title or link.
   */
  private parseItem(item: RssItem, source: string): FeedItem {
    const title = this.textOf(item.title);
    if (!title) {
      throw new Error('RSS item missing required field: title');
    }

    const url = this.textOf(item.link);
    if (!url) {
      throw new Error('RSS item missing required field: link');
    }

    // NOTE(review): without a <pubDate> the current time is used, and the id is
    // derived from url + publishedAt, so it presumably differs between parses of
    // the same item. Confirm against generateId.
    const pubDate = this.textOf(item.pubDate);
    const publishedAt = pubDate ? parseDate(pubDate) : new Date();

    return {
      id: generateId(url, publishedAt),
      source,
      title,
      url,
      publishedAt,
      summary: this.textOf(item.description),
      content: this.textOf(item['content:encoded']),
    };
  }

  /**
   * Extracts cleaned text from a parsed element value.
   *
   * Accepts a plain string, a number or boolean, or an element object whose
   * text is stored under `#text` (the shape produced for elements that carry
   * attributes). Returns `undefined` when no non-empty text is available.
   */
  private textOf(value: unknown): string | undefined {
    let raw: unknown = value;
    if (typeof raw === 'object' && raw !== null) {
      raw = (raw as Record<string, unknown>)['#text'];
    }

    if (typeof raw !== 'string' && typeof raw !== 'number' && typeof raw !== 'boolean') {
      return undefined;
    }

    const cleaned = this.cleanText(String(raw));
    return cleaned === '' ? undefined : cleaned;
  }

  /**
   * Strips a surrounding CDATA wrapper and outer whitespace from `text`.
   */
  private cleanText(text: string): string {
    if (!text) return '';
    // Trim first: the patterns are anchored to the string edges, so padding
    // around the wrapper would otherwise prevent them from matching.
    return text
      .trim()
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /**
   * Reports whether this parser handles the given content type, i.e. whether
   * it contains "rss" (case-insensitive).
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('rss');
  }
}
|