- Create separate RssParser and AtomParser classes implementing the IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via the #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
153 lines
3.9 KiB
TypeScript
153 lines
3.9 KiB
TypeScript
import type { FeedItem } from '../../interfaces/feed.types.js';
|
|
import type { IParser } from '../../interfaces/parser.interface.js';
|
|
import { XMLParser } from 'fast-xml-parser';
|
|
import { generateId, parseDate, isValidXml } from './utils.js';
|
|
|
|
/**
 * An Atom `<link>` element as produced by the XML parser.
 *
 * Attribute names carry the `@_` prefix configured on the parser.
 */
interface AtomLink {
  /** Target URL of the link (`href` attribute). */
  '@_href'?: string;
  /** Link relation (`rel` attribute), e.g. "alternate" or "self". */
  '@_rel'?: string;
}
|
|
|
|
/**
 * A text-bearing Atom element that also has attributes (e.g. `type="html"`).
 *
 * When an element carries attributes, its text is exposed under `#text`
 * instead of being the element value itself.
 */
interface AtomTextField {
  /** Text content of the element. */
  '#text'?: string;
  /** Value of the element's `type` attribute. */
  '@_type'?: string;
}
|
|
|
|
/**
 * A single Atom `<entry>` as produced by the XML parser.
 *
 * Text elements are either a plain string or, when the element has
 * attributes, an {@link AtomTextField} object.
 */
interface AtomEntry {
  /** Entry title; required by the parser when building a feed item. */
  title?: string | AtomTextField;
  /** One `<link>` element, or an array when the entry has several. */
  link?: AtomLink[] | AtomLink;
  /** Short summary of the entry. */
  summary?: string | AtomTextField;
  /** Full content of the entry. */
  content?: string | AtomTextField;
  /** Publication timestamp as written in the feed. */
  published?: string;
  /** Last-update timestamp as written in the feed. */
  updated?: string;
  /** The entry's `<id>` value. */
  id?: string;
}
|
|
|
|
/**
 * Top-level shape of a parsed Atom document.
 */
interface AtomFeed {
  /** The `<feed>` root element; absent when the document is not Atom. */
  feed?: {
    /** One `<entry>` element, or an array when the feed has several. */
    entry?: AtomEntry[] | AtomEntry;
  };
}
|
|
|
|
/**
 * Parser for Atom 1.0 feeds.
 *
 * Converts an Atom XML document into `FeedItem` objects: `<summary>` maps to
 * `summary`, `<content>` maps to `content`, and the article URL is taken from
 * the entry's `<link>` elements, preferring the "alternate" relation.
 */
export class AtomParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      // Keep attributes (href, rel, type) and expose them with an '@_' prefix.
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings; otherwise numeric-looking text such as
      // a title of "2024" is converted to a number and its text is lost.
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an Atom document into feed items.
   *
   * @param xml - Raw XML text of the feed.
   * @param source - Source label copied onto every returned item.
   * @returns One item per `<entry>`; an empty array when the feed has none.
   * @throws Error when the input fails the XML sanity check, when the XML
   *   parser rejects it, when there is no `<feed>` root, or when an entry
   *   lacks a title or a link with an href.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid Atom/XML');
    }

    let parsed: AtomFeed;
    try {
      parsed = this.xmlParser.parse(xml) as AtomFeed;
    } catch (error) {
      // Anything can be thrown in JavaScript, so narrow before reading `.message`.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.feed) {
      throw new Error('Invalid Atom: missing <feed> root element');
    }

    const entries = parsed.feed.entry;
    if (!entries) {
      return [];
    }

    // A feed with a single <entry> yields an object rather than an array.
    const entryArray = Array.isArray(entries) ? entries : [entries];

    return entryArray.map((entry) => this.parseEntry(entry, source));
  }

  /**
   * Reports whether this parser handles the given content type.
   *
   * @param contentType - A content-type string such as "application/atom+xml".
   * @returns `true` when the string contains "atom" (case-insensitive).
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('atom');
  }

  /**
   * Converts one Atom entry into a feed item.
   *
   * @throws Error when the entry has no title or no link with an href.
   */
  private parseEntry(entry: AtomEntry, source: string): FeedItem {
    // The title is validated before the link, so a missing title is reported first.
    const title = this.extractTitle(entry);

    const url = this.extractUrl(entry);
    if (!url) {
      throw new Error('Atom entry missing required field: link with href');
    }

    const publishedAt = this.extractDate(entry);

    return {
      id: generateId(url, publishedAt),
      source,
      title,
      url,
      publishedAt,
      summary: entry.summary ? this.cleanText(entry.summary) : undefined,
      content: entry.content ? this.cleanText(entry.content) : undefined,
    };
  }

  /**
   * Returns the cleaned entry title.
   *
   * @throws Error when the entry has no title element or the title is empty.
   */
  private extractTitle(entry: AtomEntry): string {
    if (entry.title == null || entry.title === '') {
      throw new Error('Atom entry missing required field: title');
    }
    return this.cleanText(entry.title);
  }

  /**
   * Picks the article URL from the entry's links.
   *
   * Order of preference, considering only links that have a non-empty href:
   *   1. a link with rel="alternate";
   *   2. a link without a rel attribute (Atom treats it as "alternate");
   *   3. any link whose rel is not "self";
   *   4. any remaining link.
   *
   * @returns The chosen href, or `undefined` when no link has one.
   */
  private extractUrl(entry: AtomEntry): string | undefined {
    if (!entry.link) {
      return undefined;
    }

    // A single <link> yields an object rather than an array.
    const links = Array.isArray(entry.link) ? entry.link : [entry.link];

    const hrefOf = (matches: (link: AtomLink) => boolean): string | undefined =>
      links.find((link) => Boolean(link['@_href']) && matches(link))?.['@_href'];

    return (
      hrefOf((link) => link['@_rel'] === 'alternate') ??
      hrefOf((link) => link['@_rel'] === undefined) ??
      hrefOf((link) => link['@_rel'] !== 'self') ??
      hrefOf(() => true)
    );
  }

  /**
   * Resolves the entry's publication date.
   *
   * Prefers `<published>` and falls back to `<updated>`. When the entry has
   * neither, the current time is used, so the resulting value (and anything
   * derived from it) differs between runs.
   */
  private extractDate(entry: AtomEntry): Date {
    if (entry.published) {
      return parseDate(entry.published);
    }
    if (entry.updated) {
      return parseDate(entry.updated);
    }
    return new Date();
  }

  /**
   * Normalises the text of an Atom element.
   *
   * Accepts either the plain element value or an object whose text is under
   * `#text` (the shape used when the element has attributes such as
   * type="html"). Numeric and boolean values are converted to their string
   * form instead of being dropped. A leading CDATA opener and trailing CDATA
   * closer are stripped, and surrounding whitespace is trimmed.
   *
   * @returns The cleaned text, or an empty string when there is none.
   */
  private cleanText(text: string | number | boolean | AtomTextField | undefined): string {
    if (text == null) return '';

    const raw: unknown = typeof text === 'object' ? text['#text'] : text;

    // Only primitives carry usable text; a nested object has no sensible string form.
    const textValue =
      typeof raw === 'string' || typeof raw === 'number' || typeof raw === 'boolean'
        ? String(raw)
        : '';

    return textValue
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }
}
|