pulse/modules/parser/rss.parser.test.ts
Edo Limburg 2ea1f5cd95 Add parser module with RSS and Atom feed parsers
- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
2026-05-05 21:15:20 +02:00

193 lines
6.0 KiB
TypeScript

import { describe, it, expect } from 'vitest';
import { RssParser } from './rss.parser.js';
describe('RssParser', () => {
const parser = new RssParser();
describe('parse', () => {
/**
 * Happy path: one <item> carrying every field this suite checks.
 * Pins the mapping asserted below: <description> -> summary,
 * <content:encoded> -> content, <pubDate> -> publishedAt, and the feed URL
 * handed to parse() -> source.
 */
it('parses valid RSS 2.0 feed with all fields', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  // Fixture notes: the content: prefix is declared on <rss> so the document is
  // namespace-well-formed, and the weekday matches the date (6 Sep 2024 was a
  // Friday) so the pubDate is a valid RFC 822 date-time.
  const xml = `<?xml version="1.0"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<item>
<title>Test Article</title>
<link>https://example.com/article</link>
<description>This is a summary</description>
<content:encoded><![CDATA[<p>Full content</p>]]></content:encoded>
<pubDate>Fri, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const items = await parser.parse(xml, feedUrl);

  expect(items).toHaveLength(1);
  const [item] = items;
  expect(item.title).toBe('Test Article');
  expect(item.url).toBe('https://example.com/article');
  expect(item.summary).toBe('This is a summary');
  // The CDATA wrapper must be stripped, leaving the raw HTML payload.
  expect(item.content).toBe('<p>Full content</p>');
  // Expected instant is written in ISO 8601 so the assertion does not rely on
  // the JS engine's handling of RFC 822 strings.
  expect(item.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
  expect(item.source).toBe(feedUrl);
  expect(item.id).toBeDefined();
});
/**
 * Minimal item: only <title> and <link> are present.
 * summary and content must be undefined; publishedAt must still be a Date
 * (this test does not pin which instant is used when <pubDate> is absent).
 */
it('parses RSS with only required fields', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  const minimalFeed = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Minimal Article</title>
<link>https://example.com/minimal</link>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(minimalFeed, feedUrl);

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.title).toBe('Minimal Article');
  expect(entry.url).toBe('https://example.com/minimal');
  // Optional fields are absent from the feed, so they must not be invented.
  expect(entry.summary).toBeUndefined();
  expect(entry.content).toBeUndefined();
  expect(entry.publishedAt).toBeInstanceOf(Date);
});
/**
 * Two <item> elements in one channel must yield two parsed items, returned
 * in document order.
 */
it('parses multiple items', async () => {
  // Weekdays match the calendar (6 Sep 2024 = Friday, 7 Sep 2024 = Saturday)
  // so both pubDate values are valid RFC 822 date-times.
  const xml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Article 1</title>
<link>https://example.com/1</link>
<pubDate>Fri, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
<item>
<title>Article 2</title>
<link>https://example.com/2</link>
<pubDate>Sat, 07 Sep 2024 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const items = await parser.parse(xml, 'https://example.com/feed.xml');

  expect(items).toHaveLength(2);
  // Comparing the whole title list checks both count and ordering at once.
  expect(items.map((item) => item.title)).toEqual(['Article 1', 'Article 2']);
});
/**
 * A channel that contains metadata but no <item> elements is valid input
 * and must produce a zero-length result rather than an error.
 */
it('returns empty array when no items', async () => {
  const emptyFeed = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
</channel>
</rss>`;
  const feedUrl = 'https://example.com/feed.xml';

  const parsed = await parser.parse(emptyFeed, feedUrl);

  expect(parsed).toHaveLength(0);
});
/**
 * An <item> with a <link> but no <title> must make parse() reject.
 * toThrow with a string matches when the error message contains it.
 */
it('throws on missing title', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  const untitledFeed = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<link>https://example.com/article</link>
</item>
</channel>
</rss>`;

  const attempt = parser.parse(untitledFeed, feedUrl);

  await expect(attempt).rejects.toThrow('missing required field: title');
});
/**
 * An <item> with a <title> but no <link> must make parse() reject.
 * toThrow with a string matches when the error message contains it.
 */
it('throws on missing link', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  const linklessFeed = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Article Without Link</title>
</item>
</channel>
</rss>`;

  const attempt = parser.parse(linklessFeed, feedUrl);

  await expect(attempt).rejects.toThrow('missing required field: link');
});
/**
 * Input that is not XML at all must be rejected with an error whose
 * message contains 'Invalid XML'.
 */
it('throws on invalid XML', async () => {
  const garbage = 'not xml at all';
  const feedUrl = 'https://example.com/feed.xml';

  const attempt = parser.parse(garbage, feedUrl);

  await expect(attempt).rejects.toThrow('Invalid XML');
});
/**
 * Well-formed XML whose root is <feed> instead of <rss> must be rejected
 * with an error whose message contains 'missing <rss> root element'.
 */
it('throws on missing rss root element', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  const wrongRoot = '<?xml version="1.0"?><feed></feed>';

  const attempt = parser.parse(wrongRoot, feedUrl);

  await expect(attempt).rejects.toThrow('missing <rss> root element');
});
/**
 * A <pubDate> written in ISO 8601 rather than the RFC 822 form used by the
 * other fixtures must still be parsed into the matching instant.
 */
it('parses ISO 8601 date as fallback', async () => {
  const isoStamp = new Date('2024-09-06T09:00:00Z');
  const isoDatedFeed = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test</title>
<link>https://example.com/article</link>
<pubDate>2024-09-06T09:00:00Z</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(isoDatedFeed, 'https://example.com/feed.xml');

  const [entry] = parsed;
  expect(entry.publishedAt).toEqual(isoStamp);
});
/**
 * Parsing the same feed twice with the same source URL must give the item
 * the same id both times.
 */
it('generates deterministic IDs', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  // Weekday matches the calendar (6 Sep 2024 was a Friday) so the pubDate is
  // a valid RFC 822 date-time.
  const xml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test</title>
<link>https://example.com/article</link>
<pubDate>Fri, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const items1 = await parser.parse(xml, feedUrl);
  const items2 = await parser.parse(xml, feedUrl);

  // Guard against a vacuous pass: undefined === undefined would satisfy the
  // equality check below even if no id were generated at all.
  expect(items1[0].id).toBeDefined();
  expect(items1[0].id).toBe(items2[0].id);
});
});
describe('supports', () => {
// The RSS media type itself must be accepted.
it('returns true for application/rss+xml', () => {
  const accepted = parser.supports('application/rss+xml');

  expect(accepted).toBe(true);
});
// Generic XML is not strictly RSS, so the parser must not claim it.
it('returns false for generic text/xml', () => {
  expect(parser.supports('text/xml')).toBe(false);
});

// A content type naming rss without the +xml suffix is still accepted.
it('returns true for application/rss', () => {
  expect(parser.supports('application/rss')).toBe(true);
});
// The Atom media type must be rejected by the RSS parser.
it('returns false for atom content type', () => {
  const accepted = parser.supports('application/atom+xml');

  expect(accepted).toBe(false);
});
// An upper-cased RSS media type must be accepted just like the lower-case one.
it('is case insensitive', () => {
  const accepted = parser.supports('APPLICATION/RSS+XML');

  expect(accepted).toBe(true);
});
});
});