import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IParser } from '../../interfaces/parser.interface.js';
import { XMLParser } from 'fast-xml-parser';
import { generateId, parseDate, isValidXml } from './utils.js';

/** Matches a string that consists of exactly one CDATA section and captures its payload. */
const CDATA_WRAPPER = /^<!\[CDATA\[([\s\S]*?)\]\]>$/;

/** A `link` element as produced by the XML parser (attributes carry the `@_` prefix). */
interface AtomLink {
  '@_href'?: string;
  '@_rel'?: string;
}

/** A text construct that carried attributes (e.g. `type="html"`), so its text lives under `#text`. */
interface AtomTextField {
  '#text'?: string;
  '@_type'?: string;
}

/** The subset of an Atom `entry` element that this parser reads. */
interface AtomEntry {
  title?: string | AtomTextField;
  link?: AtomLink[] | AtomLink;
  summary?: string | AtomTextField;
  content?: string | AtomTextField;
  published?: string;
  updated?: string;
  id?: string;
}

/** Parsed document shape: a `feed` root holding zero, one, or many entries. */
interface AtomFeed {
  feed?: {
    entry?: AtomEntry[] | AtomEntry;
  };
}

/**
 * Parser for Atom 1.0 feeds.
 */
export class AtomParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an Atom document into feed items.
   *
   * @param xml - Raw XML text of the feed.
   * @param source - Value copied into each returned item's `source` field.
   * @returns One item per entry, in document order; an empty array when the
   *   feed has no entries.
   * @throws Error if `isValidXml` rejects the input, the XML parser throws,
   *   the `feed` root is absent, or any entry lacks a title or a link href.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid Atom/XML');
    }

    let parsed: AtomFeed;
    try {
      parsed = this.xmlParser.parse(xml) as AtomFeed;
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.feed) {
      throw new Error('Invalid Atom: missing root element');
    }

    const entries = parsed.feed.entry;
    if (!entries) {
      return [];
    }

    // A feed with a single entry is parsed as an object rather than an array.
    const entryArray = Array.isArray(entries) ? entries : [entries];
    return entryArray.map((entry) => this.parseEntry(entry, source));
  }

  /**
   * Converts one parsed entry into a feed item.
   *
   * @throws Error if the entry has no title or no link with an href.
   */
  private parseEntry(entry: AtomEntry, source: string): FeedItem {
    // Title is validated before the URL so the title error takes precedence.
    const title = this.extractTitle(entry);

    const url = this.extractUrl(entry);
    if (!url) {
      throw new Error('Atom entry missing required field: link with href');
    }

    const publishedAt = this.extractDate(entry);

    return {
      id: generateId(url, publishedAt),
      source,
      title,
      url,
      publishedAt,
      summary: entry.summary ? this.cleanText(entry.summary) : undefined,
      content: entry.content ? this.cleanText(entry.content) : undefined,
    };
  }

  /**
   * Picks the entry's article URL.
   *
   * Preference order: the first `rel="alternate"` link that has an href, then
   * the first link with an href whose rel is not `"self"`, then any link with
   * an href.
   *
   * @returns The chosen href, or `undefined` when no link carries one.
   */
  private extractUrl(entry: AtomEntry): string | undefined {
    if (!entry.link) {
      return undefined;
    }

    const links = Array.isArray(entry.link) ? entry.link : [entry.link];

    // Prefer rel="alternate" (the actual article link). Requiring an href here
    // keeps an href-less alternate from hiding a later usable one.
    const alternate = links.find((l) => l['@_rel'] === 'alternate' && l['@_href']);
    if (alternate?.['@_href']) {
      return alternate['@_href'];
    }

    // Fallback to first link with href, skip rel="self"
    const firstLink = links.find((l) => l['@_href'] && l['@_rel'] !== 'self');
    if (firstLink?.['@_href']) {
      return firstLink['@_href'];
    }

    // Last resort: any href
    const anyLink = links.find((l) => l['@_href']);
    return anyLink?.['@_href'];
  }

  /**
   * Resolves the entry's timestamp: prefers `published`, falls back to
   * `updated`, and uses the current time when neither is present.
   */
  private extractDate(entry: AtomEntry): Date {
    if (entry.published) {
      return parseDate(entry.published);
    }
    if (entry.updated) {
      return parseDate(entry.updated);
    }
    return new Date();
  }

  /**
   * Normalizes a text construct to a trimmed plain string.
   *
   * Accepts either a bare string or an object whose text is stored under
   * `#text` (the shape produced when the element has attributes such as
   * `type="html"`).
   *
   * @returns The cleaned text, or `''` when there is no text.
   */
  private cleanText(text: string | AtomTextField | undefined): string {
    if (!text) return '';

    // The parser is constructed without disabling tag-value parsing, so
    // numeric-looking text may arrive as a number despite the declared types
    // (see fast-xml-parser `parseTagValue`). Read the value as unknown and
    // coerce it, rather than calling string methods on a non-string.
    const raw: unknown = typeof text === 'object' ? text['#text'] : text;
    if (raw === undefined || raw === null) return '';

    // Remove a CDATA wrapper if present, keeping its payload.
    return String(raw).trim().replace(CDATA_WRAPPER, '$1').trim();
  }

  /**
   * Returns the entry's cleaned title.
   *
   * @throws Error if the entry has no title.
   */
  private extractTitle(entry: AtomEntry): string {
    if (!entry.title) {
      throw new Error('Atom entry missing required field: title');
    }
    return this.cleanText(entry.title);
  }

  /**
   * Reports whether this parser handles the given content type: true when the
   * value contains `atom`, compared case-insensitively.
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('atom');
  }
}