pulse/modules/parser/atom.parser.test.ts
Edo Limburg 2ea1f5cd95 Add parser module with RSS and Atom feed parsers
- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
2026-05-05 21:15:20 +02:00

237 lines
7.9 KiB
TypeScript

import { describe, it, expect } from 'vitest';
import { AtomParser } from './atom.parser.js';
describe('AtomParser', () => {
const parser = new AtomParser();
describe('parse', () => {
// Happy path: one entry that carries every field asserted below.
it('parses valid Atom feed with all fields', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  // summary/content use type="html" with CDATA payloads.
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Test Article</title>',
    '<link rel="alternate" href="https://example.com/article"/>',
    '<summary type="html"><![CDATA[This is a summary]]></summary>',
    '<content type="html"><![CDATA[<p>Full content</p>]]></content>',
    '<published>2024-09-06T09:00:00Z</published>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, feedUrl);

  expect(parsed).toHaveLength(1);
  const article = parsed[0];
  expect(article.title).toBe('Test Article');
  expect(article.url).toBe('https://example.com/article');
  expect(article.summary).toBe('This is a summary');
  expect(article.content).toBe('<p>Full content</p>');
  expect(article.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
  // The second argument to parse() is expected back as the item's source.
  expect(article.source).toBe(feedUrl);
  expect(article.id).toBeDefined();
});
// Entry with nothing but <title> and a bare <link href>.
it('parses Atom with only required fields', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Minimal Article</title>',
    '<link href="https://example.com/minimal"/>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(1);
  const article = parsed[0];
  expect(article.title).toBe('Minimal Article');
  expect(article.url).toBe('https://example.com/minimal');
  // Optional text fields must be absent rather than empty strings.
  expect(article.summary).toBeUndefined();
  expect(article.content).toBeUndefined();
  // No date element in the fixture; only the type of publishedAt is pinned.
  expect(article.publishedAt).toBeInstanceOf(Date);
});
// Two entries in, two items out, in document order.
it('parses multiple entries', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Article 1</title>',
    '<link href="https://example.com/1"/>',
    '<published>2024-09-06T09:00:00Z</published>',
    '</entry>',
    '<entry>',
    '<title>Article 2</title>',
    '<link href="https://example.com/2"/>',
    '<published>2024-09-07T10:00:00Z</published>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(2);
  const firstArticle = parsed[0];
  const secondArticle = parsed[1];
  expect(firstArticle.title).toBe('Article 1');
  expect(secondArticle.title).toBe('Article 2');
});
// A feed with a title but no <entry> elements yields zero items.
it('returns empty array when no entries', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<title>Empty Feed</title>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(0);
});
// With a self link listed first, the alternate link's href must win.
it('prefers rel="alternate" link', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Test</title>',
    '<link rel="self" href="https://example.com/feed"/>',
    '<link rel="alternate" href="https://example.com/article"/>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  const article = parsed[0];
  expect(article.url).toBe('https://example.com/article');
});
// No rel="alternate" here: the link without a rel attribute is expected
// to be chosen over the rel="self" link that precedes it.
it('falls back to first non-self link', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Test</title>',
    '<link rel="self" href="https://example.com/feed"/>',
    '<link href="https://example.com/article"/>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  const article = parsed[0];
  expect(article.url).toBe('https://example.com/article');
});
// An entry without <title> must make parse() reject.
it('throws on missing title', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<link href="https://example.com/article"/>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const attempt = parser.parse(atomXml, 'https://example.com/feed.xml');

  // The string passed to toThrow is matched against the error message.
  await expect(attempt).rejects.toThrow('missing required field: title');
});
// An entry with a title but no <link> must make parse() reject.
it('throws on missing link with href', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Article Without Link</title>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const attempt = parser.parse(atomXml, 'https://example.com/feed.xml');

  // The string passed to toThrow is matched against the error message.
  await expect(attempt).rejects.toThrow('missing required field: link with href');
});
// Plain text that is not markup at all must be rejected as invalid XML.
it('throws on invalid XML', async () => {
  const notXml = 'not xml at all';

  const attempt = parser.parse(notXml, 'https://example.com/feed.xml');

  await expect(attempt).rejects.toThrow('Invalid XML');
});
// Well-formed XML whose root is <rss> rather than <feed> must be rejected.
it('throws on missing feed root element', async () => {
  const rssRootXml = '<?xml version="1.0"?><rss></rss>';

  const attempt = parser.parse(rssRootXml, 'https://example.com/feed.xml');

  await expect(attempt).rejects.toThrow('missing <feed> root element');
});
// Only <updated> is present, so its timestamp becomes publishedAt.
it('uses <updated> when <published> is missing', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Test</title>',
    '<link href="https://example.com/article"/>',
    '<updated>2024-09-06T09:00:00Z</updated>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  const article = parsed[0];
  expect(article.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
});
// Both date elements are present with different values; the expected
// publishedAt is the <published> one, not the later <updated> one.
it('prefers <published> over <updated>', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Test</title>',
    '<link href="https://example.com/article"/>',
    '<published>2024-09-06T09:00:00Z</published>',
    '<updated>2024-09-07T10:00:00Z</updated>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  const article = parsed[0];
  expect(article.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
});
// Parsing the same document twice must produce the same id both times.
it('generates deterministic IDs', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Test</title>',
    '<link href="https://example.com/article"/>',
    '<published>2024-09-06T09:00:00Z</published>',
    '</entry>',
    '</feed>',
  ].join('\n');

  // Run the two parses one after the other, as independent calls.
  const firstRun = await parser.parse(atomXml, feedUrl);
  const secondRun = await parser.parse(atomXml, feedUrl);

  const firstId = firstRun[0].id;
  const secondId = secondRun[0].id;
  expect(firstId).toBe(secondId);
});
// Several <link> elements on one entry. The fixture deliberately differs from
// the one in 'prefers rel="alternate" link' (which lists self first, alternate
// last): here the alternate link comes FIRST and is followed by two other
// links, so the test only passes if selection is by rel value and not by
// position in the list.
it('handles multiple links in array format', async () => {
  const atomXml = [
    '<?xml version="1.0"?>',
    '<feed xmlns="http://www.w3.org/2005/Atom">',
    '<entry>',
    '<title>Test</title>',
    '<link rel="alternate" href="https://example.com/article"/>',
    '<link rel="self" href="https://example.com/feed"/>',
    '<link rel="related" href="https://example.com/related"/>',
    '</entry>',
    '</feed>',
  ].join('\n');

  const parsed = await parser.parse(atomXml, 'https://example.com/feed.xml');

  // Three links must still collapse into a single item with a single url.
  expect(parsed).toHaveLength(1);
  const article = parsed[0];
  expect(article.url).toBe('https://example.com/article');
});
});
describe('supports', () => {
// The canonical Atom media type is accepted.
it('returns true for application/atom+xml', () => {
  const accepted = parser.supports('application/atom+xml');

  expect(accepted).toBe(true);
});
// A content type that merely contains "atom" (no +xml suffix) is accepted.
it('returns true for atom in content type', () => {
  const accepted = parser.supports('application/atom');

  expect(accepted).toBe(true);
});
// The RSS media type must not be claimed by the Atom parser.
it('returns false for rss content type', () => {
  const accepted = parser.supports('application/rss+xml');

  expect(accepted).toBe(false);
});
// An upper-cased Atom media type is accepted just like the lower-case one.
it('is case insensitive', () => {
  const accepted = parser.supports('APPLICATION/ATOM+XML');

  expect(accepted).toBe(true);
});
});
});