pulse/modules/parser/atom.parser.ts
Edo Limburg 2ea1f5cd95 Add parser module with RSS and Atom feed parsers
- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
2026-05-05 21:15:20 +02:00

153 lines
3.9 KiB
TypeScript

import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IParser } from '../../interfaces/parser.interface.js';
import { XMLParser } from 'fast-xml-parser';
import { generateId, parseDate, isValidXml } from './utils.js';
/**
 * Attributes of an Atom `<link>` element as they appear in the parsed XML
 * object (attribute names carry the `@_` prefix).
 */
interface AtomLink {
  /** Link relation, e.g. `alternate` or `self`. */
  '@_rel'?: string;
  /** Target of the link. */
  '@_href'?: string;
}
/**
 * Shape of a text-bearing Atom element (title, summary, content) when the
 * element carries attributes: the text moves under `#text` and each attribute
 * appears with the `@_` prefix.
 */
interface AtomTextField {
  /** Value of the element's `type` attribute, e.g. `html`. */
  '@_type'?: string;
  /** Text content of the element. */
  '#text'?: string;
}
/**
 * A single Atom `<entry>` as produced by the XML parser.
 *
 * Text elements are plain strings, or `AtomTextField` objects when the element
 * has attributes. `link` is an array only when the entry has several `<link>`
 * elements.
 */
interface AtomEntry {
  id?: string;
  title?: AtomTextField | string;
  link?: AtomLink | AtomLink[];
  // Timestamps, kept as the raw strings found in the feed.
  published?: string;
  updated?: string;
  // Optional body fields.
  summary?: AtomTextField | string;
  content?: AtomTextField | string;
}
/**
 * Root of a parsed Atom document.
 *
 * `feed.entry` is a single object when the feed holds exactly one entry and an
 * array when it holds several.
 */
interface AtomFeed {
  feed?: {
    entry?: AtomEntry | AtomEntry[];
  };
}
/**
 * Parser for Atom 1.0 feeds.
 *
 * Turns an Atom XML document into one `FeedItem` per `<entry>`. Every entry
 * must have a non-empty title and at least one `<link>` with an `href`;
 * otherwise parsing fails with a descriptive error.
 */
export class AtomParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. With the default (true) a title such as
      // "2024" or "false" would arrive as a number/boolean and break the
      // string handling below.
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an Atom document.
   *
   * @param xml - Raw XML text of the feed.
   * @param source - Identifier of the feed, copied onto every returned item.
   * @returns One item per `<entry>`; an empty array when the feed has none.
   * @throws Error when the input fails the XML pre-check, cannot be parsed,
   *   has no `<feed>` root, or contains an entry without title or link.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid Atom/XML');
    }

    let parsed: AtomFeed | undefined;
    try {
      parsed = this.xmlParser.parse(xml) as AtomFeed | undefined;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    // Test for absence, not truthiness: an empty <feed></feed> without
    // attributes is presumably represented by the parser as an empty string,
    // which is a present-but-empty feed rather than a missing root.
    const feed: unknown = parsed?.feed;
    if (feed === undefined || feed === null) {
      throw new Error('Invalid Atom: missing <feed> root element');
    }
    if (typeof feed !== 'object') {
      return [];
    }

    const entries = (feed as NonNullable<AtomFeed['feed']>).entry;
    if (!entries) {
      return [];
    }

    // A single <entry> is parsed as an object, several as an array.
    const entryArray = Array.isArray(entries) ? entries : [entries];
    return entryArray.map((entry) => this.parseEntry(entry, source));
  }

  /**
   * Converts one parsed `<entry>` into a `FeedItem`.
   *
   * @throws Error when the entry has no usable title or no link with `href`.
   */
  private parseEntry(entry: AtomEntry, source: string): FeedItem {
    const title = this.extractTitle(entry);

    const url = this.extractUrl(entry);
    if (!url) {
      throw new Error('Atom entry missing required field: link with href');
    }

    const publishedAt = this.extractDate(entry);

    return {
      id: generateId(url, publishedAt),
      source,
      title,
      url,
      publishedAt,
      summary: this.optionalText(entry.summary),
      content: this.optionalText(entry.content),
    };
  }

  /**
   * Picks the article URL from the entry's `<link>` elements.
   *
   * Only links that carry an `href` are considered, in this order:
   * 1. explicit `rel="alternate"`;
   * 2. no `rel` attribute (Atom defines a missing `rel` as "alternate");
   * 3. any relation other than `self`;
   * 4. whatever link is left.
   */
  private extractUrl(entry: AtomEntry): string | undefined {
    if (!entry.link) {
      return undefined;
    }

    const links = Array.isArray(entry.link) ? entry.link : [entry.link];
    const candidates = links.filter((link) => Boolean(link['@_href']));

    const chosen =
      candidates.find((link) => link['@_rel'] === 'alternate') ??
      candidates.find((link) => link['@_rel'] === undefined) ??
      candidates.find((link) => link['@_rel'] !== 'self') ??
      candidates[0];

    return chosen?.['@_href'];
  }

  /**
   * Returns the entry date: `<published>` if present, otherwise `<updated>`,
   * otherwise the current time.
   */
  private extractDate(entry: AtomEntry): Date {
    if (entry.published) {
      return parseDate(entry.published);
    }
    if (entry.updated) {
      return parseDate(entry.updated);
    }
    // NOTE(review): this timestamp is passed to generateId, so an entry
    // without any date may get a different id on every parse — confirm
    // whether that is acceptable for de-duplication.
    return new Date();
  }

  /**
   * Normalizes a text element to a trimmed string.
   *
   * Accepts both the plain-string form and the object form used when the
   * element has attributes (text under `#text`). Returns `''` when there is
   * no text.
   */
  private cleanText(text: string | AtomTextField | undefined): string {
    if (!text) return '';

    const raw: unknown = typeof text === 'object' ? text['#text'] : text;
    if (raw === undefined || raw === null) return '';

    // String() guards against non-string values slipping through at runtime.
    // Trim first so the anchored CDATA patterns match despite padding.
    return String(raw)
      .trim()
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /** Cleans an optional text element; empty or absent text yields `undefined`. */
  private optionalText(text: string | AtomTextField | undefined): string | undefined {
    return this.cleanText(text) || undefined;
  }

  /**
   * Returns the cleaned entry title.
   *
   * @throws Error when the title is absent or empty after cleaning.
   */
  private extractTitle(entry: AtomEntry): string {
    const title = this.cleanText(entry.title);
    if (!title) {
      throw new Error('Atom entry missing required field: title');
    }
    return title;
  }

  /** Reports whether the given content type names an Atom feed (case-insensitive). */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('atom');
  }
}