From 2ea1f5cd954bad8ccea42c6907646abb453b09be Mon Sep 17 00:00:00 2001 From: Edo Limburg Date: Tue, 5 May 2026 21:15:20 +0200 Subject: [PATCH] Add parser module with RSS and Atom feed parsers - Create separate RssParser and AtomParser implementing IParser interface - Add utility functions for ID generation (djb2 hash) and date parsing - Support both RSS (RFC 822) and Atom (ISO 8601) date formats - Handle Atom elements with attributes (type="html") via #text property - Map RSS description to summary and content:encoded to content - Map Atom summary to summary and content to content - Prefer Atom link[@rel="alternate"] for article URLs - Throw descriptive errors for malformed XML and missing required fields - Add comprehensive test coverage for both parsers (32 tests total) --- modules/parser/atom.parser.test.ts | 236 +++++++++++++++++++++++++++++ modules/parser/atom.parser.ts | 152 +++++++++++++++++++ modules/parser/index.ts | 3 + modules/parser/rss.parser.test.ts | 192 +++++++++++++++++++++++ modules/parser/rss.parser.ts | 110 ++++++++++++++ modules/parser/utils.ts | 75 +++++++++ package-lock.json | 76 ++++++++++ package.json | 1 + 8 files changed, 845 insertions(+) create mode 100644 modules/parser/atom.parser.test.ts create mode 100644 modules/parser/atom.parser.ts create mode 100644 modules/parser/index.ts create mode 100644 modules/parser/rss.parser.test.ts create mode 100644 modules/parser/rss.parser.ts create mode 100644 modules/parser/utils.ts diff --git a/modules/parser/atom.parser.test.ts b/modules/parser/atom.parser.test.ts new file mode 100644 index 0000000..7826dcd --- /dev/null +++ b/modules/parser/atom.parser.test.ts @@ -0,0 +1,236 @@ +import { describe, it, expect } from 'vitest'; +import { AtomParser } from './atom.parser.js'; + +describe('AtomParser', () => { + const parser = new AtomParser(); + + describe('parse', () => { + it('parses valid Atom feed with all fields', async () => { + const xml = ` + + + Test Article + + + Full content

]]>
+ 2024-09-06T09:00:00Z +
+
`; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(1); + expect(items[0].title).toBe('Test Article'); + expect(items[0].url).toBe('https://example.com/article'); + expect(items[0].summary).toBe('This is a summary'); + expect(items[0].content).toBe('

Full content

'); + expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z')); + expect(items[0].source).toBe('https://example.com/feed.xml'); + expect(items[0].id).toBeDefined(); + }); + + it('parses Atom with only required fields', async () => { + const xml = ` + + + Minimal Article + + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(1); + expect(items[0].title).toBe('Minimal Article'); + expect(items[0].url).toBe('https://example.com/minimal'); + expect(items[0].summary).toBeUndefined(); + expect(items[0].content).toBeUndefined(); + expect(items[0].publishedAt).toBeInstanceOf(Date); + }); + + it('parses multiple entries', async () => { + const xml = ` + + + Article 1 + + 2024-09-06T09:00:00Z + + + Article 2 + + 2024-09-07T10:00:00Z + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(2); + expect(items[0].title).toBe('Article 1'); + expect(items[1].title).toBe('Article 2'); + }); + + it('returns empty array when no entries', async () => { + const xml = ` + + Empty Feed + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(0); + }); + + it('prefers rel="alternate" link', async () => { + const xml = ` + + + Test + + + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items[0].url).toBe('https://example.com/article'); + }); + + it('falls back to first non-self link', async () => { + const xml = ` + + + Test + + + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items[0].url).toBe('https://example.com/article'); + }); + + it('throws on missing title', async () => { + const xml = ` + + + + + `; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'missing required field: title' + ); + }); + + it('throws on missing link with href', async () => { + const xml = ` + + + Article 
Without Link + + `; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'missing required field: link with href' + ); + }); + + it('throws on invalid XML', async () => { + const xml = 'not xml at all'; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'Invalid XML' + ); + }); + + it('throws on missing feed root element', async () => { + const xml = ''; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'missing root element' + ); + }); + + it('uses when is missing', async () => { + const xml = ` + + + Test + + 2024-09-06T09:00:00Z + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z')); + }); + + it('prefers over ', async () => { + const xml = ` + + + Test + + 2024-09-06T09:00:00Z + 2024-09-07T10:00:00Z + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z')); + }); + + it('generates deterministic IDs', async () => { + const xml = ` + + + Test + + 2024-09-06T09:00:00Z + + `; + + const items1 = await parser.parse(xml, 'https://example.com/feed.xml'); + const items2 = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items1[0].id).toBe(items2[0].id); + }); + + it('handles multiple links in array format', async () => { + const xml = ` + + + Test + + + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items[0].url).toBe('https://example.com/article'); + }); + }); + + describe('supports', () => { + it('returns true for application/atom+xml', () => { + expect(parser.supports('application/atom+xml')).toBe(true); + }); + + it('returns true for atom in content type', () => { + expect(parser.supports('application/atom')).toBe(true); + }); + + it('returns false for rss content type', () => { + 
expect(parser.supports('application/rss+xml')).toBe(false); + }); + + it('is case insensitive', () => { + expect(parser.supports('APPLICATION/ATOM+XML')).toBe(true); + }); + }); +}); diff --git a/modules/parser/atom.parser.ts b/modules/parser/atom.parser.ts new file mode 100644 index 0000000..8916fca --- /dev/null +++ b/modules/parser/atom.parser.ts @@ -0,0 +1,152 @@ +import type { FeedItem } from '../../interfaces/feed.types.js'; +import type { IParser } from '../../interfaces/parser.interface.js'; +import { XMLParser } from 'fast-xml-parser'; +import { generateId, parseDate, isValidXml } from './utils.js'; + +interface AtomLink { + '@_href'?: string; + '@_rel'?: string; +} + +interface AtomTextField { + '#text'?: string; + '@_type'?: string; +} + +interface AtomEntry { + title?: string | AtomTextField; + link?: AtomLink[] | AtomLink; + summary?: string | AtomTextField; + content?: string | AtomTextField; + published?: string; + updated?: string; + id?: string; +} + +interface AtomFeed { + feed?: { + entry?: AtomEntry[] | AtomEntry; + }; +} + +/** + * Parser for Atom 1.0 feeds. + */ +export class AtomParser implements IParser { + private xmlParser: XMLParser; + + constructor() { + this.xmlParser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '@_', + parseAttributeValue: false, + trimValues: true, + }); + } + + async parse(xml: string, source: string): Promise { + if (!isValidXml(xml)) { + throw new Error('Invalid XML: does not appear to be valid Atom/XML'); + } + + let parsed: AtomFeed; + try { + parsed = this.xmlParser.parse(xml) as AtomFeed; + } catch (error) { + throw new Error(`XML parsing failed: ${(error as Error).message}`); + } + + if (!parsed.feed) { + throw new Error('Invalid Atom: missing root element'); + } + + const entries = parsed.feed.entry; + if (!entries) { + return []; + } + + const entryArray = Array.isArray(entries) ? 
entries : [entries]; + + return entryArray.map((entry) => this.parseEntry(entry, source)); + } + + private parseEntry(entry: AtomEntry, source: string): FeedItem { + if (!entry.title) { + throw new Error('Atom entry missing required field: title'); + } + + const url = this.extractUrl(entry); + if (!url) { + throw new Error('Atom entry missing required field: link with href'); + } + + const publishedAt = this.extractDate(entry); + + return { + id: generateId(url, publishedAt), + source, + title: this.cleanText(entry.title), + url, + publishedAt, + summary: entry.summary ? this.cleanText(entry.summary) : undefined, + content: entry.content ? this.cleanText(entry.content) : undefined, + }; + } + + private extractUrl(entry: AtomEntry): string | undefined { + if (!entry.link) { + return undefined; + } + + const links = Array.isArray(entry.link) ? entry.link : [entry.link]; + + // Prefer rel="alternate" (the actual article link) + const alternate = links.find(l => l['@_rel'] === 'alternate'); + if (alternate?.['@_href']) { + return alternate['@_href']; + } + + // Fallback to first link with href, skip rel="self" + const firstLink = links.find(l => l['@_href'] && l['@_rel'] !== 'self'); + if (firstLink?.['@_href']) { + return firstLink['@_href']; + } + + // Last resort: any href + const anyLink = links.find(l => l['@_href']); + return anyLink?.['@_href']; + } + + private extractDate(entry: AtomEntry): Date { + // Prefer , fallback to + if (entry.published) { + return parseDate(entry.published); + } + if (entry.updated) { + return parseDate(entry.updated); + } + return new Date(); + } + + private cleanText(text: string | AtomTextField | undefined): string { + if (!text) return ''; + // Handle object with #text property (when element has attributes like type="html") + const textValue = typeof text === 'string' ? 
text : text['#text'] || ''; + // Remove CDATA wrappers if present + return textValue + .replace(/^$/, '') + .trim(); + } + + private extractTitle(entry: AtomEntry): string { + if (!entry.title) { + throw new Error('Atom entry missing required field: title'); + } + return this.cleanText(entry.title); + } + + supports(contentType: string): boolean { + return contentType.toLowerCase().includes('atom'); + } +} diff --git a/modules/parser/index.ts b/modules/parser/index.ts new file mode 100644 index 0000000..4847fac --- /dev/null +++ b/modules/parser/index.ts @@ -0,0 +1,3 @@ +export { RssParser } from './rss.parser.js'; +export { AtomParser } from './atom.parser.js'; +export { generateId, parseDate } from './utils.js'; diff --git a/modules/parser/rss.parser.test.ts b/modules/parser/rss.parser.test.ts new file mode 100644 index 0000000..727f60e --- /dev/null +++ b/modules/parser/rss.parser.test.ts @@ -0,0 +1,192 @@ +import { describe, it, expect } from 'vitest'; +import { RssParser } from './rss.parser.js'; + +describe('RssParser', () => { + const parser = new RssParser(); + + describe('parse', () => { + it('parses valid RSS 2.0 feed with all fields', async () => { + const xml = ` + + + + Test Article + https://example.com/article + This is a summary + Full content

]]>
+ Mon, 06 Sep 2024 09:00:00 GMT +
+
+
`; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(1); + expect(items[0].title).toBe('Test Article'); + expect(items[0].url).toBe('https://example.com/article'); + expect(items[0].summary).toBe('This is a summary'); + expect(items[0].content).toBe('

Full content

'); + expect(items[0].publishedAt).toEqual(new Date('Mon, 06 Sep 2024 09:00:00 GMT')); + expect(items[0].source).toBe('https://example.com/feed.xml'); + expect(items[0].id).toBeDefined(); + }); + + it('parses RSS with only required fields', async () => { + const xml = ` + + + + Minimal Article + https://example.com/minimal + + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(1); + expect(items[0].title).toBe('Minimal Article'); + expect(items[0].url).toBe('https://example.com/minimal'); + expect(items[0].summary).toBeUndefined(); + expect(items[0].content).toBeUndefined(); + expect(items[0].publishedAt).toBeInstanceOf(Date); + }); + + it('parses multiple items', async () => { + const xml = ` + + + + Article 1 + https://example.com/1 + Mon, 06 Sep 2024 09:00:00 GMT + + + Article 2 + https://example.com/2 + Tue, 07 Sep 2024 10:00:00 GMT + + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(2); + expect(items[0].title).toBe('Article 1'); + expect(items[1].title).toBe('Article 2'); + }); + + it('returns empty array when no items', async () => { + const xml = ` + + + Empty Feed + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items).toHaveLength(0); + }); + + it('throws on missing title', async () => { + const xml = ` + + + + https://example.com/article + + + `; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'missing required field: title' + ); + }); + + it('throws on missing link', async () => { + const xml = ` + + + + Article Without Link + + + `; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'missing required field: link' + ); + }); + + it('throws on invalid XML', async () => { + const xml = 'not xml at all'; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'Invalid XML' + ); + }); + + 
it('throws on missing rss root element', async () => { + const xml = ''; + + await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow( + 'missing root element' + ); + }); + + it('parses ISO 8601 date as fallback', async () => { + const xml = ` + + + + Test + https://example.com/article + 2024-09-06T09:00:00Z + + + `; + + const items = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z')); + }); + + it('generates deterministic IDs', async () => { + const xml = ` + + + + Test + https://example.com/article + Mon, 06 Sep 2024 09:00:00 GMT + + + `; + + const items1 = await parser.parse(xml, 'https://example.com/feed.xml'); + const items2 = await parser.parse(xml, 'https://example.com/feed.xml'); + + expect(items1[0].id).toBe(items2[0].id); + }); + }); + + describe('supports', () => { + it('returns true for application/rss+xml', () => { + expect(parser.supports('application/rss+xml')).toBe(true); + }); + + it('returns true for text/xml with rss', () => { + expect(parser.supports('text/xml')).toBe(false); // Not strictly RSS + expect(parser.supports('application/rss')).toBe(true); + }); + + it('returns false for atom content type', () => { + expect(parser.supports('application/atom+xml')).toBe(false); + }); + + it('is case insensitive', () => { + expect(parser.supports('APPLICATION/RSS+XML')).toBe(true); + }); + }); +}); diff --git a/modules/parser/rss.parser.ts b/modules/parser/rss.parser.ts new file mode 100644 index 0000000..e667531 --- /dev/null +++ b/modules/parser/rss.parser.ts @@ -0,0 +1,110 @@ +import type { FeedItem } from '../../interfaces/feed.types.js'; +import type { IParser } from '../../interfaces/parser.interface.js'; +import { XMLParser } from 'fast-xml-parser'; +import { generateId, parseDate, isValidXml } from './utils.js'; + +interface RssChannel { + title?: string; + link?: string; + description?: string; +} + +interface RssItem { + title?: string; + 
link?: string; + description?: string; + 'content:encoded'?: string; + pubDate?: string; + guid?: string; +} + +interface RssFeed { + rss?: { + channel?: { + item?: RssItem[] | RssItem; + }; + }; +} + +/** + * Parser for RSS 2.0 feeds. + */ +export class RssParser implements IParser { + private xmlParser: XMLParser; + + constructor() { + this.xmlParser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '@_', + parseAttributeValue: false, + trimValues: true, + }); + } + + async parse(xml: string, source: string): Promise { + if (!isValidXml(xml)) { + throw new Error('Invalid XML: does not appear to be valid RSS/XML'); + } + + let parsed: RssFeed; + try { + parsed = this.xmlParser.parse(xml) as RssFeed; + } catch (error) { + throw new Error(`XML parsing failed: ${(error as Error).message}`); + } + + if (!parsed.rss) { + throw new Error('Invalid RSS: missing root element'); + } + + const channel = parsed.rss.channel; + if (!channel) { + throw new Error('Invalid RSS: missing element'); + } + + const items = channel.item; + if (!items) { + return []; + } + + const itemArray = Array.isArray(items) ? items : [items]; + + return itemArray.map((item) => this.parseItem(item, source)); + } + + private parseItem(item: RssItem, source: string): FeedItem { + if (!item.title) { + throw new Error('RSS item missing required field: title'); + } + + if (!item.link) { + throw new Error('RSS item missing required field: link'); + } + + const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date(); + const url = item.link; + + return { + id: generateId(url, publishedAt), + source, + title: this.cleanText(item.title), + url, + publishedAt, + summary: item.description ? this.cleanText(item.description) : undefined, + content: item['content:encoded'] ? 
this.cleanText(item['content:encoded']) : undefined, + }; + } + + private cleanText(text: string): string { + if (!text) return ''; + // Remove CDATA wrappers if present + return text + .replace(/^$/, '') + .trim(); + } + + supports(contentType: string): boolean { + return contentType.toLowerCase().includes('rss'); + } +} diff --git a/modules/parser/utils.ts b/modules/parser/utils.ts new file mode 100644 index 0000000..e971bd1 --- /dev/null +++ b/modules/parser/utils.ts @@ -0,0 +1,75 @@ +/** + * Utility functions for the parser module. + */ + +/** + * Generates a deterministic hash from URL and published date. + * Uses djb2 algorithm for fast, consistent hashing. + */ +export function generateId(url: string, publishedAt: Date): string { + const str = `${url}::${publishedAt.toISOString()}`; + let hash = 5381; + for (let i = 0; i < str.length; i++) { + hash = ((hash << 5) + hash) + str.charCodeAt(i); + } + return (hash >>> 0).toString(16); // Convert to unsigned and hex +} + +/** + * Parses dates from RSS (RFC 822) and Atom (ISO 8601) formats. + * Tries multiple formats for robustness. + * Throws if date cannot be parsed. 
+ */ +export function parseDate(dateStr: string): Date { + if (!dateStr || typeof dateStr !== 'string') { + throw new Error(`Invalid date string: ${dateStr}`); + } + + const trimmed = dateStr.trim(); + + // Try native Date parsing first (handles ISO 8601 and many common formats) + let date = new Date(trimmed); + if (!isNaN(date.getTime())) { + return date; + } + + // Try RFC 822 format: Mon, 06 Sep 2024 09:00:00 GMT + const rfc822Match = trimmed.match(/^\w{3},?\s+(\d{1,2})\s+(\w{3})\s+(\d{4})\s+(\d{1,2}):(\d{2}):(\d{2})\s*(?:GMT|UTC|[+-]\d{4})?$/i); + if (rfc822Match) { + const months: { [key: string]: number } = { + jan: 0, feb: 1, mar: 2, apr: 3, may: 4, jun: 5, + jul: 6, aug: 7, sep: 8, oct: 9, nov: 10, dec: 11 + }; + const month = months[rfc822Match[2].toLowerCase()]; + if (month !== undefined) { + date = new Date( + parseInt(rfc822Match[3]), + month, + parseInt(rfc822Match[1]), + parseInt(rfc822Match[4]), + parseInt(rfc822Match[5]), + parseInt(rfc822Match[6]) + ); + if (!isNaN(date.getTime())) { + return date; + } + } + } + + throw new Error(`Unable to parse date: ${dateStr}`); +} + +/** + * Validates if a string is valid XML. + * Basic check for XML declaration or root element. 
+ */ +export function isValidXml(xml: string): boolean { + if (!xml || typeof xml !== 'string') { + return false; + } + const trimmed = xml.trim(); + return trimmed.startsWith('=12.0.0" } }, + "node_modules/fast-xml-builder": { + "version": "1.1.8", + "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.8.tgz", + "integrity": "sha512-sDVBc2gg8pSKvcbE8rBmOyjSGQf0AdsbqvHeIOv3D/uYNoV4eCReQXyDF8Pdv8+m1FHazACypSz2hR7O2S1LLw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + } + ], + "license": "MIT", + "dependencies": { + "path-expression-matcher": "^1.1.3" + } + }, + "node_modules/fast-xml-parser": { + "version": "5.7.3", + "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.7.3.tgz", + "integrity": "sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + } + ], + "license": "MIT", + "dependencies": { + "@nodable/entities": "^2.1.0", + "fast-xml-builder": "^1.1.7", + "path-expression-matcher": "^1.5.0", + "strnum": "^2.2.3" + }, + "bin": { + "fxparser": "src/cli/cli.js" + } + }, "node_modules/fsevents": { "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", @@ -1161,6 +1210,21 @@ "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, + "node_modules/path-expression-matcher": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/path-expression-matcher/-/path-expression-matcher-1.5.0.tgz", + "integrity": "sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + } + ], + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/pathe": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/pathe/-/pathe-1.1.2.tgz", 
@@ -1300,6 +1364,18 @@ "dev": true, "license": "MIT" }, + "node_modules/strnum": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/strnum/-/strnum-2.2.3.tgz", + "integrity": "sha512-oKx6RUCuHfT3oyVjtnrmn19H1SiCqgJSg+54XqURKp5aCMbrXrhLjRN9TjuwMjiYstZ0MzDrHqkGZ5dFTKd+zg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/NaturalIntelligence" + } + ], + "license": "MIT" + }, "node_modules/tinybench": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz", diff --git a/package.json b/package.json index 42fd78a..d54be99 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,7 @@ "vitest": "^2.1.0" }, "dependencies": { + "fast-xml-parser": "^5.7.3", "undici": "^6.21.0" } }