Add parser module with RSS and Atom feed parsers
- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
This commit is contained in:
parent
84cafa9ecb
commit
2ea1f5cd95
236
modules/parser/atom.parser.test.ts
Normal file
236
modules/parser/atom.parser.test.ts
Normal file
@ -0,0 +1,236 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { AtomParser } from './atom.parser.js';
|
||||||
|
|
||||||
|
describe('AtomParser', () => {
  // Source URL handed to every parse() call in this suite.
  const FEED_URL = 'https://example.com/feed.xml';
  const parser = new AtomParser();

  // Shorthand: run the parser under test with the shared source URL.
  const parseFeed = (xml: string) => parser.parse(xml, FEED_URL);

  describe('parse', () => {
    it('parses valid Atom feed with all fields', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test Article</title>
            <link rel="alternate" href="https://example.com/article"/>
            <summary type="html"><![CDATA[This is a summary]]></summary>
            <content type="html"><![CDATA[<p>Full content</p>]]></content>
            <published>2024-09-06T09:00:00Z</published>
          </entry>
        </feed>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(1);
      const [item] = items;
      expect(item.title).toBe('Test Article');
      expect(item.url).toBe('https://example.com/article');
      expect(item.summary).toBe('This is a summary');
      expect(item.content).toBe('<p>Full content</p>');
      expect(item.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
      expect(item.source).toBe(FEED_URL);
      expect(item.id).toBeDefined();
    });

    it('parses Atom with only required fields', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Minimal Article</title>
            <link href="https://example.com/minimal"/>
          </entry>
        </feed>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(1);
      const [item] = items;
      expect(item.title).toBe('Minimal Article');
      expect(item.url).toBe('https://example.com/minimal');
      expect(item.summary).toBeUndefined();
      expect(item.content).toBeUndefined();
      expect(item.publishedAt).toBeInstanceOf(Date);
    });

    it('parses multiple entries', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Article 1</title>
            <link href="https://example.com/1"/>
            <published>2024-09-06T09:00:00Z</published>
          </entry>
          <entry>
            <title>Article 2</title>
            <link href="https://example.com/2"/>
            <published>2024-09-07T10:00:00Z</published>
          </entry>
        </feed>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(2);
      const [first, second] = items;
      expect(first.title).toBe('Article 1');
      expect(second.title).toBe('Article 2');
    });

    it('returns empty array when no entries', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <title>Empty Feed</title>
        </feed>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(0);
    });

    it('prefers rel="alternate" link', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link rel="self" href="https://example.com/feed"/>
            <link rel="alternate" href="https://example.com/article"/>
          </entry>
        </feed>`;

      const [item] = await parseFeed(xml);

      expect(item.url).toBe('https://example.com/article');
    });

    it('falls back to first non-self link', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link rel="self" href="https://example.com/feed"/>
            <link href="https://example.com/article"/>
          </entry>
        </feed>`;

      const [item] = await parseFeed(xml);

      expect(item.url).toBe('https://example.com/article');
    });

    it('throws on missing title', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <link href="https://example.com/article"/>
          </entry>
        </feed>`;

      await expect(parseFeed(xml)).rejects.toThrow('missing required field: title');
    });

    it('throws on missing link with href', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Article Without Link</title>
          </entry>
        </feed>`;

      await expect(parseFeed(xml)).rejects.toThrow('missing required field: link with href');
    });

    it('throws on invalid XML', async () => {
      const xml = 'not xml at all';

      await expect(parseFeed(xml)).rejects.toThrow('Invalid XML');
    });

    it('throws on missing feed root element', async () => {
      const xml = '<?xml version="1.0"?><rss></rss>';

      await expect(parseFeed(xml)).rejects.toThrow('missing <feed> root element');
    });

    it('uses <updated> when <published> is missing', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link href="https://example.com/article"/>
            <updated>2024-09-06T09:00:00Z</updated>
          </entry>
        </feed>`;

      const [item] = await parseFeed(xml);

      expect(item.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
    });

    it('prefers <published> over <updated>', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link href="https://example.com/article"/>
            <published>2024-09-06T09:00:00Z</published>
            <updated>2024-09-07T10:00:00Z</updated>
          </entry>
        </feed>`;

      const [item] = await parseFeed(xml);

      expect(item.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
    });

    it('generates deterministic IDs', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link href="https://example.com/article"/>
            <published>2024-09-06T09:00:00Z</published>
          </entry>
        </feed>`;

      // Parsing the same document twice must yield the same id.
      const [firstRun] = await parseFeed(xml);
      const [secondRun] = await parseFeed(xml);

      expect(firstRun.id).toBe(secondRun.id);
    });

    it('handles multiple links in array format', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link rel="self" href="https://example.com/feed"/>
            <link rel="alternate" href="https://example.com/article"/>
          </entry>
        </feed>`;

      const [item] = await parseFeed(xml);

      expect(item.url).toBe('https://example.com/article');
    });
  });

  describe('supports', () => {
    it('returns true for application/atom+xml', () => {
      const supported = parser.supports('application/atom+xml');
      expect(supported).toBe(true);
    });

    it('returns true for atom in content type', () => {
      const supported = parser.supports('application/atom');
      expect(supported).toBe(true);
    });

    it('returns false for rss content type', () => {
      const supported = parser.supports('application/rss+xml');
      expect(supported).toBe(false);
    });

    it('is case insensitive', () => {
      const supported = parser.supports('APPLICATION/ATOM+XML');
      expect(supported).toBe(true);
    });
  });
});
|
||||||
152
modules/parser/atom.parser.ts
Normal file
152
modules/parser/atom.parser.ts
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
import type { FeedItem } from '../../interfaces/feed.types.js';
|
||||||
|
import type { IParser } from '../../interfaces/parser.interface.js';
|
||||||
|
import { XMLParser } from 'fast-xml-parser';
|
||||||
|
import { generateId, parseDate, isValidXml } from './utils.js';
|
||||||
|
|
||||||
|
/** Attributes of an Atom `<link>` element, keyed with the parser's `@_` attribute prefix. */
interface AtomLink {
  '@_rel'?: string;
  '@_href'?: string;
}

/**
 * Shape of a text element that carries attributes (e.g. `type="html"`):
 * the character data is then found under `#text`.
 */
interface AtomTextField {
  '@_type'?: string;
  '#text'?: string;
}

/** A single `<entry>`; repeated `<link>` children arrive as an array, a lone one as an object. */
interface AtomEntry {
  id?: string;
  title?: AtomTextField | string;
  link?: AtomLink | AtomLink[];
  summary?: AtomTextField | string;
  content?: AtomTextField | string;
  published?: string;
  updated?: string;
}

/** Top-level parse result: the `<feed>` root with zero, one, or many entries. */
interface AtomFeed {
  feed?: {
    entry?: AtomEntry | AtomEntry[];
  };
}
|
||||||
|
|
||||||
|
/**
 * Parser for Atom 1.0 feeds.
 *
 * Converts every `<entry>` of a `<feed>` document into a `FeedItem`. Each entry
 * must provide a title and at least one `<link>` carrying an `href`; a single
 * entry that violates this makes the whole parse fail with a descriptive error.
 */
export class AtomParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. With the library default (true) a title
      // such as "2024" is turned into a number, which breaks the string
      // handling in cleanText(), and a title of "0" looks like a missing field.
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an Atom document into feed items.
   *
   * @param xml - Raw feed document.
   * @param source - Identifier of the feed; copied onto every item as `source`.
   * @returns One item per `<entry>`, in document order; empty when the feed has no entries.
   * @throws Error when the input does not look like XML, cannot be parsed, has
   *   no `<feed>` root, or contains an entry without a title or a usable link.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid Atom/XML');
    }

    let parsed: AtomFeed;
    try {
      parsed = this.xmlParser.parse(xml) as AtomFeed;
    } catch (error) {
      throw new Error(`XML parsing failed: ${(error as Error).message}`);
    }

    if (!parsed.feed) {
      throw new Error('Invalid Atom: missing <feed> root element');
    }

    const entries = parsed.feed.entry;
    if (!entries) {
      return [];
    }

    // A single <entry> is parsed as an object, several as an array.
    const entryArray = Array.isArray(entries) ? entries : [entries];

    return entryArray.map((entry) => this.parseEntry(entry, source));
  }

  /**
   * Maps one parsed `<entry>` to a `FeedItem`.
   *
   * @throws Error when the entry has no title or no link with an `href`.
   */
  private parseEntry(entry: AtomEntry, source: string): FeedItem {
    // Title is validated before the link so a bare entry reports the title first.
    const title = this.extractTitle(entry);

    const url = this.extractUrl(entry);
    if (!url) {
      throw new Error('Atom entry missing required field: link with href');
    }

    // NOTE(review): entries without <published>/<updated> get the current time,
    // so the id derived from it differs on every parse for such entries.
    const publishedAt = this.extractDate(entry);

    return {
      id: generateId(url, publishedAt),
      source,
      title,
      url,
      publishedAt,
      summary: entry.summary ? this.cleanText(entry.summary) : undefined,
      content: entry.content ? this.cleanText(entry.content) : undefined,
    };
  }

  /**
   * Picks the article URL from the entry's links.
   *
   * Only links that actually carry an `href` are considered. Preference order:
   * explicit `rel="alternate"`, then links without `rel` (which Atom defines as
   * "alternate"), then any link that is not `rel="self"`, then whatever is left.
   */
  private extractUrl(entry: AtomEntry): string | undefined {
    if (!entry.link) {
      return undefined;
    }

    const links = Array.isArray(entry.link) ? entry.link : [entry.link];
    const withHref = links.filter((link) => Boolean(link['@_href']));

    const chosen =
      withHref.find((link) => link['@_rel'] === 'alternate') ??
      withHref.find((link) => link['@_rel'] === undefined) ??
      withHref.find((link) => link['@_rel'] !== 'self') ??
      withHref[0];

    return chosen?.['@_href'];
  }

  /** Returns `<published>` if present, else `<updated>`, else the current time. */
  private extractDate(entry: AtomEntry): Date {
    if (entry.published) {
      return parseDate(entry.published);
    }
    if (entry.updated) {
      return parseDate(entry.updated);
    }
    return new Date();
  }

  /**
   * Normalises a text element to a plain trimmed string.
   *
   * Elements with attributes (e.g. `type="html"`) arrive as an object whose
   * text lives under `#text`; a missing value yields an empty string.
   */
  private cleanText(text: string | AtomTextField | undefined): string {
    if (!text) return '';

    const textValue = typeof text === 'string' ? text : text['#text'] || '';

    // Strip a literal CDATA wrapper in case one survived XML parsing.
    return textValue
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /**
   * Returns the cleaned entry title.
   *
   * @throws Error when the entry has no title.
   */
  private extractTitle(entry: AtomEntry): string {
    if (!entry.title) {
      throw new Error('Atom entry missing required field: title');
    }
    return this.cleanText(entry.title);
  }

  /** Reports whether `contentType` mentions "atom" (case-insensitive). */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('atom');
  }
}
|
||||||
3
modules/parser/index.ts
Normal file
3
modules/parser/index.ts
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
export { RssParser } from './rss.parser.js';
|
||||||
|
export { AtomParser } from './atom.parser.js';
|
||||||
|
export { generateId, parseDate } from './utils.js';
|
||||||
192
modules/parser/rss.parser.test.ts
Normal file
192
modules/parser/rss.parser.test.ts
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
import { describe, it, expect } from 'vitest';
|
||||||
|
import { RssParser } from './rss.parser.js';
|
||||||
|
|
||||||
|
describe('RssParser', () => {
  // Source URL handed to every parse() call in this suite.
  const FEED_URL = 'https://example.com/feed.xml';
  const parser = new RssParser();

  // Shorthand: run the parser under test with the shared source URL.
  const parseFeed = (xml: string) => parser.parse(xml, FEED_URL);

  describe('parse', () => {
    it('parses valid RSS 2.0 feed with all fields', async () => {
      // The content: prefix is declared so the fixture is namespace-well-formed.
      // 06 Sep 2024 is a Friday; the expectation below uses the ISO form so it
      // does not depend on how leniently the engine parses RFC 822 strings.
      const xml = `<?xml version="1.0"?>
        <rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
          <channel>
            <item>
              <title>Test Article</title>
              <link>https://example.com/article</link>
              <description>This is a summary</description>
              <content:encoded><![CDATA[<p>Full content</p>]]></content:encoded>
              <pubDate>Fri, 06 Sep 2024 09:00:00 GMT</pubDate>
            </item>
          </channel>
        </rss>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(1);
      const [item] = items;
      expect(item.title).toBe('Test Article');
      expect(item.url).toBe('https://example.com/article');
      expect(item.summary).toBe('This is a summary');
      expect(item.content).toBe('<p>Full content</p>');
      expect(item.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
      expect(item.source).toBe(FEED_URL);
      expect(item.id).toBeDefined();
    });

    it('parses RSS with only required fields', async () => {
      const xml = `<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <title>Minimal Article</title>
              <link>https://example.com/minimal</link>
            </item>
          </channel>
        </rss>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(1);
      const [item] = items;
      expect(item.title).toBe('Minimal Article');
      expect(item.url).toBe('https://example.com/minimal');
      expect(item.summary).toBeUndefined();
      expect(item.content).toBeUndefined();
      expect(item.publishedAt).toBeInstanceOf(Date);
    });

    it('parses multiple items', async () => {
      const xml = `<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <title>Article 1</title>
              <link>https://example.com/1</link>
              <pubDate>Fri, 06 Sep 2024 09:00:00 GMT</pubDate>
            </item>
            <item>
              <title>Article 2</title>
              <link>https://example.com/2</link>
              <pubDate>Sat, 07 Sep 2024 10:00:00 GMT</pubDate>
            </item>
          </channel>
        </rss>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(2);
      expect(items[0].title).toBe('Article 1');
      expect(items[1].title).toBe('Article 2');
    });

    it('returns empty array when no items', async () => {
      const xml = `<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <title>Empty Feed</title>
          </channel>
        </rss>`;

      const items = await parseFeed(xml);

      expect(items).toHaveLength(0);
    });

    it('throws on missing title', async () => {
      const xml = `<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <link>https://example.com/article</link>
            </item>
          </channel>
        </rss>`;

      await expect(parseFeed(xml)).rejects.toThrow('missing required field: title');
    });

    it('throws on missing link', async () => {
      const xml = `<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <title>Article Without Link</title>
            </item>
          </channel>
        </rss>`;

      await expect(parseFeed(xml)).rejects.toThrow('missing required field: link');
    });

    it('throws on invalid XML', async () => {
      const xml = 'not xml at all';

      await expect(parseFeed(xml)).rejects.toThrow('Invalid XML');
    });

    it('throws on missing rss root element', async () => {
      const xml = '<?xml version="1.0"?><feed></feed>';

      await expect(parseFeed(xml)).rejects.toThrow('missing <rss> root element');
    });

    it('parses ISO 8601 date as fallback', async () => {
      const xml = `<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <title>Test</title>
              <link>https://example.com/article</link>
              <pubDate>2024-09-06T09:00:00Z</pubDate>
            </item>
          </channel>
        </rss>`;

      const items = await parseFeed(xml);

      expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
    });

    it('generates deterministic IDs', async () => {
      const xml = `<?xml version="1.0"?>
        <rss version="2.0">
          <channel>
            <item>
              <title>Test</title>
              <link>https://example.com/article</link>
              <pubDate>Fri, 06 Sep 2024 09:00:00 GMT</pubDate>
            </item>
          </channel>
        </rss>`;

      // Parsing the same document twice must yield the same id.
      const items1 = await parseFeed(xml);
      const items2 = await parseFeed(xml);

      expect(items1[0].id).toBe(items2[0].id);
    });
  });

  describe('supports', () => {
    it('returns true for application/rss+xml', () => {
      expect(parser.supports('application/rss+xml')).toBe(true);
    });

    it('requires "rss" in the content type (generic text/xml is not claimed)', () => {
      expect(parser.supports('text/xml')).toBe(false); // Not strictly RSS
      expect(parser.supports('application/rss')).toBe(true);
    });

    it('returns false for atom content type', () => {
      expect(parser.supports('application/atom+xml')).toBe(false);
    });

    it('is case insensitive', () => {
      expect(parser.supports('APPLICATION/RSS+XML')).toBe(true);
    });
  });
});
|
||||||
110
modules/parser/rss.parser.ts
Normal file
110
modules/parser/rss.parser.ts
Normal file
@ -0,0 +1,110 @@
|
|||||||
|
import type { FeedItem } from '../../interfaces/feed.types.js';
|
||||||
|
import type { IParser } from '../../interfaces/parser.interface.js';
|
||||||
|
import { XMLParser } from 'fast-xml-parser';
|
||||||
|
import { generateId, parseDate, isValidXml } from './utils.js';
|
||||||
|
|
||||||
|
/** Channel-level metadata fields of an RSS document. */
interface RssChannel {
  title?: string;
  description?: string;
  link?: string;
}

/** A single `<item>` as it comes out of the XML parser. */
interface RssItem {
  guid?: string;
  title?: string;
  link?: string;
  pubDate?: string;
  description?: string;
  'content:encoded'?: string;
}

/** Top-level parse result: `<rss>` → `<channel>` → zero, one, or many items. */
interface RssFeed {
  rss?: {
    channel?: {
      item?: RssItem | RssItem[];
    };
  };
}
|
||||||
|
|
||||||
|
/**
 * Parser for RSS 2.0 feeds.
 *
 * Converts every `<item>` of the feed's `<channel>` into a `FeedItem`. Each
 * item must provide a title and a link; a single item that violates this makes
 * the whole parse fail with a descriptive error.
 */
export class RssParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. With the library default (true) a title
      // or description such as "2024" is turned into a number, which breaks the
      // string handling in cleanText() and parseDate().
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an RSS document into feed items.
   *
   * @param xml - Raw feed document.
   * @param source - Identifier of the feed; copied onto every item as `source`.
   * @returns One item per `<item>`, in document order; empty when the channel has no items.
   * @throws Error when the input does not look like XML, cannot be parsed, has
   *   no `<rss>` root or `<channel>`, or contains an item without title or link.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
    }

    let parsed: RssFeed;
    try {
      parsed = this.xmlParser.parse(xml) as RssFeed;
    } catch (error) {
      throw new Error(`XML parsing failed: ${(error as Error).message}`);
    }

    if (!parsed.rss) {
      throw new Error('Invalid RSS: missing <rss> root element');
    }

    const channel = parsed.rss.channel;
    if (!channel) {
      throw new Error('Invalid RSS: missing <channel> element');
    }

    const items = channel.item;
    if (!items) {
      return [];
    }

    // A single <item> is parsed as an object, several as an array.
    const itemArray = Array.isArray(items) ? items : [items];

    return itemArray.map((item) => this.parseItem(item, source));
  }

  /**
   * Maps one parsed `<item>` to a `FeedItem`.
   *
   * @throws Error when the item has no title or no link.
   */
  private parseItem(item: RssItem, source: string): FeedItem {
    if (!item.title) {
      throw new Error('RSS item missing required field: title');
    }

    if (!item.link) {
      throw new Error('RSS item missing required field: link');
    }

    // NOTE(review): items without <pubDate> get the current time, so the id
    // derived from it differs on every parse for such items.
    const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
    const url = item.link;

    return {
      id: generateId(url, publishedAt),
      source,
      title: this.cleanText(item.title),
      url,
      publishedAt,
      summary: item.description ? this.cleanText(item.description) : undefined,
      content: item['content:encoded'] ? this.cleanText(item['content:encoded']) : undefined,
    };
  }

  /** Trims `text` and strips a literal CDATA wrapper in case one survived XML parsing. */
  private cleanText(text: string): string {
    if (!text) return '';

    return text
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /** Reports whether `contentType` mentions "rss" (case-insensitive). */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('rss');
  }
}
|
||||||
75
modules/parser/utils.ts
Normal file
75
modules/parser/utils.ts
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
/**
|
||||||
|
* Utility functions for the parser module.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Derives a stable identifier for a feed item from its URL and publish date.
 *
 * The two values are joined as `<url>::<ISO timestamp>` and hashed with djb2
 * (seed 5381, `hash * 33 + codeUnit` per UTF-16 code unit).
 *
 * @param url - Item URL.
 * @param publishedAt - Publish date; its ISO-8601 form is part of the hash input.
 * @returns The hash reduced to an unsigned 32-bit value, as lowercase hex.
 */
export function generateId(url: string, publishedAt: Date): string {
  const key = `${url}::${publishedAt.toISOString()}`;

  // (acc << 5) + acc is the shift-and-add form of acc * 33.
  const digest = key
    .split('')
    .reduce((acc, unit) => (acc << 5) + acc + unit.charCodeAt(0), 5381);

  // >>> 0 reinterprets the accumulator as an unsigned 32-bit integer.
  return (digest >>> 0).toString(16);
}
|
||||||
|
|
||||||
|
/**
 * Parses dates from RSS (RFC 822) and Atom (ISO 8601) formats.
 *
 * The engine's own `Date` parser is tried first. Because its handling of
 * non-ISO strings is implementation-defined, an explicit RFC 822 parser is used
 * as a fallback for strings like `Fri, 06 Sep 2024 09:00:00 GMT` or
 * `Fri, 06 Sep 2024 11:00:00 +0200`.
 *
 * In the fallback, a `GMT`/`UTC`/`±hhmm` zone is honoured; when no zone is
 * given the fields are interpreted in the host's local time. Out-of-range
 * fields (e.g. hour 25, 31 Feb) are rejected instead of rolling over into
 * another day.
 *
 * @param dateStr - Date string to parse.
 * @returns The parsed date.
 * @throws Error if `dateStr` is empty, not a string, or cannot be parsed.
 */
export function parseDate(dateStr: string): Date {
  if (!dateStr || typeof dateStr !== 'string') {
    throw new Error(`Invalid date string: ${dateStr}`);
  }

  const trimmed = dateStr.trim();

  // Native parsing handles ISO 8601 and, in most engines, RFC 822 as well.
  const native = new Date(trimmed);
  if (!Number.isNaN(native.getTime())) {
    return native;
  }

  // Fallback: <weekday>[,] <day> <month> <year> <h>:<mm>:<ss> [zone]
  const rfc822Match = trimmed.match(
    /^\w{3},?\s+(\d{1,2})\s+(\w{3})\s+(\d{4})\s+(\d{1,2}):(\d{2}):(\d{2})\s*(GMT|UTC|[+-]\d{4})?$/i
  );
  if (rfc822Match) {
    const [, dayStr, monthStr, yearStr, hourStr, minuteStr, secondStr, zone] = rfc822Match;

    const monthNames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
    const month = monthNames.indexOf(monthStr.toLowerCase());
    const day = Number.parseInt(dayStr, 10);
    const year = Number.parseInt(yearStr, 10);
    const hour = Number.parseInt(hourStr, 10);
    const minute = Number.parseInt(minuteStr, 10);
    const second = Number.parseInt(secondStr, 10);

    const fieldsInRange =
      month !== -1 && day >= 1 && day <= 31 && hour <= 23 && minute <= 59 && second <= 59;

    if (fieldsInRange) {
      if (zone === undefined) {
        // No zone given: interpret as local time, as the native parser would.
        const local = new Date(year, month, day, hour, minute, second);
        // A changed month/day means the day does not exist (e.g. 31 Feb).
        if (local.getMonth() === month && local.getDate() === day) {
          return local;
        }
      } else {
        // Build the wall-clock instant in UTC, then shift by the zone offset.
        const wallClockMs = Date.UTC(year, month, day, hour, minute, second);
        const wallClock = new Date(wallClockMs);
        if (wallClock.getUTCMonth() === month && wallClock.getUTCDate() === day) {
          let offsetMinutes = 0; // GMT / UTC
          if (zone[0] === '+' || zone[0] === '-') {
            const sign = zone[0] === '-' ? -1 : 1;
            const zoneHours = Number.parseInt(zone.slice(1, 3), 10);
            const zoneMinutes = Number.parseInt(zone.slice(3, 5), 10);
            offsetMinutes = sign * (zoneHours * 60 + zoneMinutes);
          }
          // "+0200" means the wall clock is two hours ahead of UTC.
          return new Date(wallClockMs - offsetMinutes * 60000);
        }
      }
    }
  }

  throw new Error(`Unable to parse date: ${dateStr}`);
}
|
||||||
|
|
||||||
|
/**
 * Cheap sniff test for "looks like a feed document".
 *
 * This is not a real XML validation: after trimming surrounding whitespace the
 * input only has to start with an XML declaration or with one of the opening
 * tags `<rss`, `<feed` or `<channel`.
 *
 * @param xml - Candidate document.
 * @returns `true` if the input starts with one of the accepted prefixes;
 *   `false` otherwise, including for empty or non-string input.
 */
export function isValidXml(xml: string): boolean {
  if (typeof xml !== 'string' || xml.length === 0) {
    return false;
  }

  const acceptedPrefixes = ['<?xml', '<rss', '<feed', '<channel'];
  const candidate = xml.trim();

  return acceptedPrefixes.some((prefix) => candidate.startsWith(prefix));
}
|
||||||
76
package-lock.json
generated
76
package-lock.json
generated
@ -8,6 +8,7 @@
|
|||||||
"name": "pulse",
|
"name": "pulse",
|
||||||
"version": "0.1.0",
|
"version": "0.1.0",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"fast-xml-parser": "^5.7.3",
|
||||||
"undici": "^6.21.0"
|
"undici": "^6.21.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
@ -466,6 +467,18 @@
|
|||||||
"dev": true,
|
"dev": true,
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/@nodable/entities": {
|
||||||
|
"version": "2.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/@nodable/entities/-/entities-2.1.0.tgz",
|
||||||
|
"integrity": "sha512-nyT7T3nbMyBI/lvr6L5TyWbFJAI9FTgVRakNoBqCD+PmID8DzFrrNdLLtHMwMszOtqZa8PAOV24ZqDnQrhQINA==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/nodable"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/@rollup/rollup-android-arm-eabi": {
|
"node_modules/@rollup/rollup-android-arm-eabi": {
|
||||||
"version": "4.60.3",
|
"version": "4.60.3",
|
||||||
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.60.3.tgz",
|
"resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.60.3.tgz",
|
||||||
@ -1090,6 +1103,42 @@
|
|||||||
"node": ">=12.0.0"
|
"node": ">=12.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/fast-xml-builder": {
|
||||||
|
"version": "1.1.8",
|
||||||
|
"resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.8.tgz",
|
||||||
|
"integrity": "sha512-sDVBc2gg8pSKvcbE8rBmOyjSGQf0AdsbqvHeIOv3D/uYNoV4eCReQXyDF8Pdv8+m1FHazACypSz2hR7O2S1LLw==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/NaturalIntelligence"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"path-expression-matcher": "^1.1.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"node_modules/fast-xml-parser": {
|
||||||
|
"version": "5.7.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.7.3.tgz",
|
||||||
|
"integrity": "sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/NaturalIntelligence"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"dependencies": {
|
||||||
|
"@nodable/entities": "^2.1.0",
|
||||||
|
"fast-xml-builder": "^1.1.7",
|
||||||
|
"path-expression-matcher": "^1.5.0",
|
||||||
|
"strnum": "^2.2.3"
|
||||||
|
},
|
||||||
|
"bin": {
|
||||||
|
"fxparser": "src/cli/cli.js"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/fsevents": {
|
"node_modules/fsevents": {
|
||||||
"version": "2.3.3",
|
"version": "2.3.3",
|
||||||
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
|
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
|
||||||
@ -1161,6 +1210,21 @@
|
|||||||
"node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
|
"node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/path-expression-matcher": {
|
||||||
|
"version": "1.5.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/path-expression-matcher/-/path-expression-matcher-1.5.0.tgz",
|
||||||
|
"integrity": "sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/NaturalIntelligence"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=14.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/pathe": {
|
"node_modules/pathe": {
|
||||||
"version": "1.1.2",
|
"version": "1.1.2",
|
||||||
"resolved": "https://registry.npmjs.org/pathe/-/pathe-1.1.2.tgz",
|
"resolved": "https://registry.npmjs.org/pathe/-/pathe-1.1.2.tgz",
|
||||||
@ -1300,6 +1364,18 @@
|
|||||||
"dev": true,
|
"dev": true,
|
||||||
"license": "MIT"
|
"license": "MIT"
|
||||||
},
|
},
|
||||||
|
"node_modules/strnum": {
|
||||||
|
"version": "2.2.3",
|
||||||
|
"resolved": "https://registry.npmjs.org/strnum/-/strnum-2.2.3.tgz",
|
||||||
|
"integrity": "sha512-oKx6RUCuHfT3oyVjtnrmn19H1SiCqgJSg+54XqURKp5aCMbrXrhLjRN9TjuwMjiYstZ0MzDrHqkGZ5dFTKd+zg==",
|
||||||
|
"funding": [
|
||||||
|
{
|
||||||
|
"type": "github",
|
||||||
|
"url": "https://github.com/sponsors/NaturalIntelligence"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"license": "MIT"
|
||||||
|
},
|
||||||
"node_modules/tinybench": {
|
"node_modules/tinybench": {
|
||||||
"version": "2.9.0",
|
"version": "2.9.0",
|
||||||
"resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz",
|
"resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz",
|
||||||
|
|||||||
@ -14,6 +14,7 @@
|
|||||||
"vitest": "^2.1.0"
|
"vitest": "^2.1.0"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"fast-xml-parser": "^5.7.3",
|
||||||
"undici": "^6.21.0"
|
"undici": "^6.21.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user