pulse/modules/parser/rss.parser.test.ts
Edo Limburg c79eb6d76d Add CLI entry point, RSS content extraction, and image support
Features:
- Add CLI with commands: start, add, remove, list, fetch, status, items
- Auto-detect RSS format when adding feeds
- Auto-run database migrations on startup
- Extract full HTML content from RSS description field (NOS-style feeds)
- Extract image URLs from RSS enclosure tags
- Display images in terminal output with emoji
- Include imageUrl in JSON formatter output

Database:
- Add image_url column to feed_items table
- Update storage layer to persist imageUrl field

Tests:
- Add 10 CLI integration tests
- Add 3 RSS parser tests for image/content extraction
- Add 2 storage tests for imageUrl persistence

Dependencies:
- Add commander for CLI framework

All 144 tests passing
2026-05-05 23:05:30 +02:00

326 lines
12 KiB
TypeScript

import { describe, it, expect } from 'vitest';
import { RssParser } from './rss.parser.js';
describe('RssParser', () => {
const parser = new RssParser();
describe('parse', () => {
// Happy path: one item with title, link, description, content:encoded and pubDate.
it('parses valid RSS 2.0 feed with all fields', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  // NOTE(review): 6 Sep 2024 fell on a Friday, so the "Mon" weekday in the fixture
  // does not match the date. The expectation reuses the same string, so the test
  // stays self-consistent.
  const publishedRaw = 'Mon, 06 Sep 2024 09:00:00 GMT';
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Article</title>
<link>https://example.com/article</link>
<description>This is a summary</description>
<content:encoded><![CDATA[<p>Full content</p>]]></content:encoded>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, feedUrl);

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.title).toBe('Test Article');
  expect(entry.url).toBe('https://example.com/article');
  expect(entry.summary).toBe('This is a summary');
  expect(entry.content).toBe('<p>Full content</p>');
  expect(entry.publishedAt).toEqual(new Date(publishedRaw));
  // The feed URL passed to parse() is echoed back as the item's source.
  expect(entry.source).toBe(feedUrl);
  expect(entry.id).toBeDefined();
});
// An item with nothing but <title> and <link>: optional fields are left undefined,
// while publishedAt is still expected to be a Date instance.
it('parses RSS with only required fields', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Minimal Article</title>
<link>https://example.com/minimal</link>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.title).toBe('Minimal Article');
  expect(entry.url).toBe('https://example.com/minimal');
  expect(entry.summary).toBeUndefined();
  expect(entry.content).toBeUndefined();
  expect(entry.publishedAt).toBeInstanceOf(Date);
});
// Two <item> elements in one channel must come back as two entries, in document order.
it('parses multiple items', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Article 1</title>
<link>https://example.com/1</link>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
<item>
<title>Article 2</title>
<link>https://example.com/2</link>
<pubDate>Tue, 07 Sep 2024 10:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(2);
  const [first, second] = parsed;
  expect(first.title).toBe('Article 1');
  expect(second.title).toBe('Article 2');
});
// A channel that carries metadata but no <item> elements yields an empty result.
it('returns empty array when no items', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(0);
});
// An item without <title> must make parse() reject with the matching message.
it('throws on missing title', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<link>https://example.com/article</link>
</item>
</channel>
</rss>`;

  const pending = parser.parse(feedXml, 'https://example.com/feed.xml');

  await expect(pending).rejects.toThrow('missing required field: title');
});
// An item without <link> must make parse() reject with the matching message.
it('throws on missing link', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Article Without Link</title>
</item>
</channel>
</rss>`;

  const pending = parser.parse(feedXml, 'https://example.com/feed.xml');

  await expect(pending).rejects.toThrow('missing required field: link');
});
// Input that is not XML at all is rejected with an 'Invalid XML' error.
it('throws on invalid XML', async () => {
  const garbage = 'not xml at all';

  const pending = parser.parse(garbage, 'https://example.com/feed.xml');

  await expect(pending).rejects.toThrow('Invalid XML');
});
// A document whose root is <feed> rather than <rss> is rejected.
it('throws on missing rss root element', async () => {
  const feedRootXml = '<?xml version="1.0"?><feed></feed>';

  const pending = parser.parse(feedRootXml, 'https://example.com/feed.xml');

  await expect(pending).rejects.toThrow('missing <rss> root element');
});
// A pubDate written as an ISO 8601 timestamp must still resolve to the same instant.
it('parses ISO 8601 date as fallback', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test</title>
<link>https://example.com/article</link>
<pubDate>2024-09-06T09:00:00Z</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  const expectedInstant = new Date('2024-09-06T09:00:00Z');
  expect(parsed[0].publishedAt).toEqual(expectedInstant);
});
// Parsing the identical document twice against the same feed URL must produce the same item id.
it('generates deterministic IDs', async () => {
  const feedUrl = 'https://example.com/feed.xml';
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test</title>
<link>https://example.com/article</link>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  // Run the two parses one after the other, as separate calls.
  const firstRun = await parser.parse(feedXml, feedUrl);
  const secondRun = await parser.parse(feedXml, feedUrl);

  expect(firstRun[0].id).toBe(secondRun[0].id);
});
// Some feeds (NOS is the example here) ship the full article HTML inside <description>
// and provide no <content:encoded>. The HTML should land in `content`, with a short
// summary derived alongside it.
it('uses description as content when no content:encoded and description contains HTML', async () => {
  const feedXml = `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<item>
<title><![CDATA[Iran ontkent aanvallen VAE]]></title>
<link>https://nos.nl/l/2613264</link>
<description><![CDATA[
<p>Iran ontkent aanvallen te hebben uitgevoerd op de Verenigde Arabische Emiraten.</p>
<p>Gisteren werden er ook al aanvallen gemeld door de VAE.</p>
<h2>Onderhandelingen onmogelijk</h2>
<p>Iraanse staatsmedia melden dat de Iraanse president Pezeshkian heeft gezegd dat de VS aan de ene kant de druk op Iran opvoert.</p>
]]></description>
<pubDate>Tue, 5 May 2026 21:44:46 +0200</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://feeds.nos.nl/nosnieuwsalgemeen');

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.title).toBe('Iran ontkent aanvallen VAE');

  // The HTML markup from the description must survive in `content`.
  expect(entry.content).toContain('<p>Iran ontkent aanvallen');
  expect(entry.content).toContain('<h2>Onderhandelingen onmogelijk</h2>');

  // A summary must exist and start from the article text.
  expect(entry.summary).toBeDefined();
  expect(entry.summary).toContain('Iran ontkent aanvallen');
  // Loose upper bound: the original note describes a 200-character excerpt plus "...";
  // 210 leaves some slack above that.
  expect(entry.summary?.length).toBeLessThanOrEqual(210);
});
// A short, tag-free <description> is treated as the summary and does not populate `content`.
it('uses description as summary when it looks like plain text summary', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Short Summary Article</title>
<link>https://example.com/article</link>
<description>This is just a brief summary without HTML tags</description>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.summary).toBe('This is just a brief summary without HTML tags');
  expect(entry.content).toBeUndefined();
});
// CDATA-wrapped <title> and <description> must come back as their inner text/HTML,
// with no CDATA delimiters left in the parsed values.
it('strips CDATA wrappers from description and content', async () => {
  const xml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title><![CDATA[CDATA Title]]></title>
<link>https://example.com/article</link>
<description><![CDATA[<p>This is a very long content with <strong>formatting</strong> and lots of text to ensure it exceeds the 500 character threshold for being considered full content. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.</p>]]></description>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const items = await parser.parse(xml, 'https://example.com/feed.xml');

  expect(items[0].title).toBe('CDATA Title');
  expect(items[0].content).toContain('<p>This is a very long content');
  expect(items[0].content).toContain('<strong>formatting</strong>');
  // The positive substring checks above would also pass if the wrapper leaked through,
  // so assert explicitly that neither delimiter is left in the content.
  expect(items[0].content).not.toContain('<![CDATA[');
  expect(items[0].content).not.toContain(']]>');
});
// A single <enclosure> with an image MIME type supplies the item's imageUrl.
it('extracts image URL from enclosure', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Article with Image</title>
<link>https://example.com/article</link>
<description>Article summary</description>
<enclosure url="https://example.com/image.jpg" type="image/jpeg" length="12345"/>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.imageUrl).toBe('https://example.com/image.jpg');
});
// With audio, image and video enclosures on one item, imageUrl must be the image
// enclosure's URL even though an audio enclosure precedes it.
it('extracts first image from multiple enclosures', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Article with Multiple Enclosures</title>
<link>https://example.com/article</link>
<description>Article summary</description>
<enclosure url="https://example.com/audio.mp3" type="audio/mpeg" length="12345"/>
<enclosure url="https://example.com/image.webp" type="image/webp" length="67890"/>
<enclosure url="https://example.com/video.mp4" type="video/mp4" length="99999"/>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.imageUrl).toBe('https://example.com/image.webp');
});
// No <enclosure> on the item: it still parses, and imageUrl is left undefined.
it('handles items without enclosure', async () => {
  const feedXml = `<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Article without Image</title>
<link>https://example.com/article</link>
<description>Article summary</description>
<pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
</item>
</channel>
</rss>`;

  const parsed = await parser.parse(feedXml, 'https://example.com/feed.xml');

  expect(parsed).toHaveLength(1);
  const [entry] = parsed;
  expect(entry.imageUrl).toBeUndefined();
});
});
// Content-type matching: which MIME strings RssParser.supports() accepts.
describe('supports', () => {
  it('returns true for application/rss+xml', () => {
    expect(parser.supports('application/rss+xml')).toBe(true);
  });

  // The original single test was titled "returns true for text/xml with rss" while
  // asserting that text/xml is NOT supported. It is split here so each title states
  // exactly what is asserted.
  it('returns false for generic text/xml', () => {
    expect(parser.supports('text/xml')).toBe(false); // Not strictly RSS
  });

  it('returns true for application/rss', () => {
    expect(parser.supports('application/rss')).toBe(true);
  });

  it('returns false for atom content type', () => {
    expect(parser.supports('application/atom+xml')).toBe(false);
  });

  it('is case insensitive', () => {
    expect(parser.supports('APPLICATION/RSS+XML')).toBe(true);
  });
});
});