From 2ea1f5cd954bad8ccea42c6907646abb453b09be Mon Sep 17 00:00:00 2001
From: Edo Limburg <edolimburg@live.nl>
Date: Tue, 5 May 2026 21:15:20 +0200
Subject: [PATCH] Add parser module with RSS and Atom feed parsers

- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
---
 modules/parser/atom.parser.test.ts | 236 +++++++++++++++++++++++++++++
 modules/parser/atom.parser.ts      | 152 +++++++++++++++++++
 modules/parser/index.ts            |   3 +
 modules/parser/rss.parser.test.ts  | 192 +++++++++++++++++++++++
 modules/parser/rss.parser.ts       | 110 ++++++++++++++
 modules/parser/utils.ts            |  75 +++++++++
 package-lock.json                  |  76 ++++++++++
 package.json                       |   1 +
 8 files changed, 845 insertions(+)
 create mode 100644 modules/parser/atom.parser.test.ts
 create mode 100644 modules/parser/atom.parser.ts
 create mode 100644 modules/parser/index.ts
 create mode 100644 modules/parser/rss.parser.test.ts
 create mode 100644 modules/parser/rss.parser.ts
 create mode 100644 modules/parser/utils.ts
diff --git a/modules/parser/atom.parser.test.ts b/modules/parser/atom.parser.test.ts
new file mode 100644
index 0000000..7826dcd
--- /dev/null
+++ b/modules/parser/atom.parser.test.ts
@@ -0,0 +1,236 @@
+import { describe, it, expect } from 'vitest';
+import { AtomParser } from './atom.parser.js';
+
+describe('AtomParser', () => {
+  const parser = new AtomParser();
+
+  describe('parse', () => {
+    it('parses valid Atom feed with all fields', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Test Article</title>
+            <link rel="alternate" href="https://example.com/article"/>
+            <summary type="html"><![CDATA[This is a summary]]></summary>
+            <content type="html"><![CDATA[<p>Full content</p>]]></content>
+            <published>2024-09-06T09:00:00Z</published>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(1);
+      expect(items[0].title).toBe('Test Article');
+      expect(items[0].url).toBe('https://example.com/article');
+      expect(items[0].summary).toBe('This is a summary');
+      expect(items[0].content).toBe('<p>Full content</p>');
+      expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
+      expect(items[0].source).toBe('https://example.com/feed.xml');
+      expect(items[0].id).toBeDefined();
+    });
+
+    it('parses Atom with only required fields', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Minimal Article</title>
+            <link href="https://example.com/minimal"/>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(1);
+      expect(items[0].title).toBe('Minimal Article');
+      expect(items[0].url).toBe('https://example.com/minimal');
+      expect(items[0].summary).toBeUndefined();
+      expect(items[0].content).toBeUndefined();
+      expect(items[0].publishedAt).toBeInstanceOf(Date);
+    });
+
+    it('parses multiple entries', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Article 1</title>
+            <link href="https://example.com/1"/>
+            <published>2024-09-06T09:00:00Z</published>
+          </entry>
+          <entry>
+            <title>Article 2</title>
+            <link href="https://example.com/2"/>
+            <published>2024-09-07T10:00:00Z</published>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(2);
+      expect(items[0].title).toBe('Article 1');
+      expect(items[1].title).toBe('Article 2');
+    });
+
+    it('returns empty array when no entries', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <title>Empty Feed</title>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(0);
+    });
+
+    it('prefers rel="alternate" link', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Test</title>
+            <link rel="self" href="https://example.com/feed"/>
+            <link rel="alternate" href="https://example.com/article"/>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items[0].url).toBe('https://example.com/article');
+    });
+
+    it('falls back to first non-self link', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Test</title>
+            <link rel="self" href="https://example.com/feed"/>
+            <link href="https://example.com/article"/>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items[0].url).toBe('https://example.com/article');
+    });
+
+    it('throws on missing title', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <link href="https://example.com/article"/>
+          </entry>
+        </feed>`;
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'missing required field: title'
+      );
+    });
+
+    it('throws on missing link with href', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Article Without Link</title>
+          </entry>
+        </feed>`;
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'missing required field: link with href'
+      );
+    });
+
+    it('throws on invalid XML', async () => {
+      const xml = 'not xml at all';
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'Invalid XML'
+      );
+    });
+
+    it('throws on missing feed root element', async () => {
+      const xml = '<?xml version="1.0"?><rss></rss>';
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'missing <feed> root element'
+      );
+    });
+
+    it('uses <updated> when <published> is missing', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Test</title>
+            <link href="https://example.com/article"/>
+            <updated>2024-09-06T09:00:00Z</updated>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
+    });
+
+    it('prefers <published> over <updated>', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Test</title>
+            <link href="https://example.com/article"/>
+            <published>2024-09-06T09:00:00Z</published>
+            <updated>2024-09-07T10:00:00Z</updated>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
+    });
+
+    it('generates deterministic IDs', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Test</title>
+            <link href="https://example.com/article"/>
+            <published>2024-09-06T09:00:00Z</published>
+          </entry>
+        </feed>`;
+
+      const items1 = await parser.parse(xml, 'https://example.com/feed.xml');
+      const items2 = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items1[0].id).toBe(items2[0].id);
+    });
+
+    it('falls back to a rel="self" link when it is the only link', async () => {
+      const xml = `<?xml version="1.0"?>
+        <feed xmlns="http://www.w3.org/2005/Atom">
+          <entry>
+            <title>Test</title>
+            <link rel="self" href="https://example.com/self"/>
+          </entry>
+        </feed>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(1);
+      expect(items[0].url).toBe('https://example.com/self');
+    });
+  });
+
+  describe('supports', () => {
+    it('returns true for application/atom+xml', () => {
+      expect(parser.supports('application/atom+xml')).toBe(true);
+    });
+
+    it('returns true for atom in content type', () => {
+      expect(parser.supports('application/atom')).toBe(true);
+    });
+
+    it('returns false for rss content type', () => {
+      expect(parser.supports('application/rss+xml')).toBe(false);
+    });
+
+    it('is case insensitive', () => {
+      expect(parser.supports('APPLICATION/ATOM+XML')).toBe(true);
+    });
+  });
+});
diff --git a/modules/parser/atom.parser.ts b/modules/parser/atom.parser.ts
new file mode 100644
index 0000000..8916fca
--- /dev/null
+++ b/modules/parser/atom.parser.ts
@@ -0,0 +1,152 @@
+import type { FeedItem } from '../../interfaces/feed.types.js';
+import type { IParser } from '../../interfaces/parser.interface.js';
+import { XMLParser } from 'fast-xml-parser';
+import { generateId, parseDate, isValidXml } from './utils.js';
+
+interface AtomLink {
+  '@_href'?: string;
+  '@_rel'?: string;
+}
+
+interface AtomTextField {
+  '#text'?: string;
+  '@_type'?: string;
+}
+
+interface AtomEntry {
+  title?: string | AtomTextField;
+  link?: AtomLink[] | AtomLink;
+  summary?: string | AtomTextField;
+  content?: string | AtomTextField;
+  published?: string;
+  updated?: string;
+  id?: string;
+}
+
+interface AtomFeed {
+  feed?: {
+    entry?: AtomEntry[] | AtomEntry;
+  };
+}
+
+/**
+ * Parser for Atom 1.0 feeds.
+ *
+ * Converts each <entry> into a FeedItem. Tag values are kept as strings
+ * (`parseTagValue: false`) so numeric-looking titles or dates are never
+ * coerced to numbers before text cleaning and date parsing.
+ */
+export class AtomParser implements IParser {
+  private xmlParser: XMLParser;
+
+  constructor() {
+    this.xmlParser = new XMLParser({
+      ignoreAttributes: false,
+      attributeNamePrefix: '@_',
+      parseAttributeValue: false,
+      parseTagValue: false,
+      trimValues: true,
+    });
+  }
+
+  /**
+   * Parses an Atom document into feed items.
+   * @param xml - Raw Atom XML.
+   * @param source - Feed URL recorded on every returned item.
+   * @throws {Error} on non-XML input, parser failure, a missing <feed> root,
+   *   or an entry without a title or a link carrying an href.
+   */
+  async parse(xml: string, source: string): Promise<FeedItem[]> {
+    if (!isValidXml(xml)) {
+      throw new Error('Invalid XML: does not appear to be valid Atom/XML');
+    }
+
+    let parsed: AtomFeed;
+    try {
+      parsed = this.xmlParser.parse(xml) as AtomFeed;
+    } catch (error) {
+      throw new Error(`XML parsing failed: ${(error as Error).message}`);
+    }
+
+    if (!parsed.feed) {
+      throw new Error('Invalid Atom: missing <feed> root element');
+    }
+
+    const entries = parsed.feed.entry;
+    if (!entries) {
+      return [];
+    }
+
+    const entryArray = Array.isArray(entries) ? entries : [entries];
+    return entryArray.map((entry) => this.parseEntry(entry, source));
+  }
+
+  /** Maps one <entry> to a FeedItem, enforcing the required title and link. */
+  private parseEntry(entry: AtomEntry, source: string): FeedItem {
+    const title = this.extractTitle(entry);
+
+    const url = this.extractUrl(entry);
+    if (!url) {
+      throw new Error('Atom entry missing required field: link with href');
+    }
+
+    const dateText = entry.published || entry.updated;
+    const publishedAt = dateText ? parseDate(dateText) : new Date();
+
+    return {
+      // Hash undated entries against the epoch so their ID is stable across fetches.
+      id: generateId(url, dateText ? publishedAt : new Date(0)),
+      source,
+      title,
+      url,
+      publishedAt,
+      summary: entry.summary ? this.cleanText(entry.summary) : undefined,
+      content: entry.content ? this.cleanText(entry.content) : undefined,
+    };
+  }
+
+  /**
+   * Picks the article URL: an "alternate" link first (a link with no rel
+   * counts as alternate per RFC 4287), then any non-self link, then any href.
+   */
+  private extractUrl(entry: AtomEntry): string | undefined {
+    if (!entry.link) {
+      return undefined;
+    }
+
+    const links = (Array.isArray(entry.link) ? entry.link : [entry.link]).filter((l) => l['@_href']);
+    const relOf = (l: AtomLink): string => l['@_rel'] ?? 'alternate';
+
+    const chosen =
+      links.find((l) => l['@_rel'] === 'alternate') ??
+      links.find((l) => relOf(l) === 'alternate') ??
+      links.find((l) => relOf(l) !== 'self') ??
+      links[0];
+    return chosen?.['@_href'];
+  }
+
+  /** Returns the text of a plain or attributed element, without CDATA wrappers. */
+  private cleanText(text: string | AtomTextField | undefined): string {
+    if (!text) return '';
+    // Elements with attributes (e.g. type="html") are parsed as objects holding #text.
+    const textValue = typeof text === 'string' ? text : text['#text'] || '';
+    return textValue
+      .replace(/^<!\[CDATA\[/, '')
+      .replace(/\]\]>$/, '')
+      .trim();
+  }
+
+  /** Returns the cleaned entry title; throws when it is absent or empty. */
+  private extractTitle(entry: AtomEntry): string {
+    const title = this.cleanText(entry.title);
+    if (!title) {
+      throw new Error('Atom entry missing required field: title');
+    }
+    return title;
+  }
+
+  /** True when the content type mentions "atom" (case-insensitive). */
+  supports(contentType: string): boolean {
+    return contentType.toLowerCase().includes('atom');
+  }
+}
diff --git a/modules/parser/index.ts b/modules/parser/index.ts
new file mode 100644
index 0000000..4847fac
--- /dev/null
+++ b/modules/parser/index.ts
@@ -0,0 +1,3 @@
+export { RssParser } from './rss.parser.js';
+export { AtomParser } from './atom.parser.js';
+export { generateId, parseDate } from './utils.js';
diff --git a/modules/parser/rss.parser.test.ts b/modules/parser/rss.parser.test.ts
new file mode 100644
index 0000000..727f60e
--- /dev/null
+++ b/modules/parser/rss.parser.test.ts
@@ -0,0 +1,192 @@
+import { describe, it, expect } from 'vitest';
+import { RssParser } from './rss.parser.js';
+
+describe('RssParser', () => {
+  const parser = new RssParser();
+
+  describe('parse', () => {
+    it('parses valid RSS 2.0 feed with all fields', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <item>
+              <title>Test Article</title>
+              <link>https://example.com/article</link>
+              <description>This is a summary</description>
+              <content:encoded><![CDATA[<p>Full content</p>]]></content:encoded>
+              <pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
+            </item>
+          </channel>
+        </rss>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(1);
+      expect(items[0].title).toBe('Test Article');
+      expect(items[0].url).toBe('https://example.com/article');
+      expect(items[0].summary).toBe('This is a summary');
+      expect(items[0].content).toBe('<p>Full content</p>');
+      expect(items[0].publishedAt).toEqual(new Date('Mon, 06 Sep 2024 09:00:00 GMT'));
+      expect(items[0].source).toBe('https://example.com/feed.xml');
+      expect(items[0].id).toBeDefined();
+    });
+
+    it('parses RSS with only required fields', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <item>
+              <title>Minimal Article</title>
+              <link>https://example.com/minimal</link>
+            </item>
+          </channel>
+        </rss>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(1);
+      expect(items[0].title).toBe('Minimal Article');
+      expect(items[0].url).toBe('https://example.com/minimal');
+      expect(items[0].summary).toBeUndefined();
+      expect(items[0].content).toBeUndefined();
+      expect(items[0].publishedAt).toBeInstanceOf(Date);
+    });
+
+    it('parses multiple items', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <item>
+              <title>Article 1</title>
+              <link>https://example.com/1</link>
+              <pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
+            </item>
+            <item>
+              <title>Article 2</title>
+              <link>https://example.com/2</link>
+              <pubDate>Tue, 07 Sep 2024 10:00:00 GMT</pubDate>
+            </item>
+          </channel>
+        </rss>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(2);
+      expect(items[0].title).toBe('Article 1');
+      expect(items[1].title).toBe('Article 2');
+    });
+
+    it('returns empty array when no items', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <title>Empty Feed</title>
+          </channel>
+        </rss>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items).toHaveLength(0);
+    });
+
+    it('throws on missing title', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <item>
+              <link>https://example.com/article</link>
+            </item>
+          </channel>
+        </rss>`;
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'missing required field: title'
+      );
+    });
+
+    it('throws on missing link', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <item>
+              <title>Article Without Link</title>
+            </item>
+          </channel>
+        </rss>`;
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'missing required field: link'
+      );
+    });
+
+    it('throws on invalid XML', async () => {
+      const xml = 'not xml at all';
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'Invalid XML'
+      );
+    });
+
+    it('throws on missing rss root element', async () => {
+      const xml = '<?xml version="1.0"?><feed></feed>';
+
+      await expect(parser.parse(xml, 'https://example.com/feed.xml')).rejects.toThrow(
+        'missing <rss> root element'
+      );
+    });
+
+    it('parses ISO 8601 date as fallback', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <item>
+              <title>Test</title>
+              <link>https://example.com/article</link>
+              <pubDate>2024-09-06T09:00:00Z</pubDate>
+            </item>
+          </channel>
+        </rss>`;
+
+      const items = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items[0].publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
+    });
+
+    it('generates deterministic IDs', async () => {
+      const xml = `<?xml version="1.0"?>
+        <rss version="2.0">
+          <channel>
+            <item>
+              <title>Test</title>
+              <link>https://example.com/article</link>
+              <pubDate>Mon, 06 Sep 2024 09:00:00 GMT</pubDate>
+            </item>
+          </channel>
+        </rss>`;
+
+      const items1 = await parser.parse(xml, 'https://example.com/feed.xml');
+      const items2 = await parser.parse(xml, 'https://example.com/feed.xml');
+
+      expect(items1[0].id).toBe(items2[0].id);
+    });
+  });
+
+  describe('supports', () => {
+    it('returns true for application/rss+xml', () => {
+      expect(parser.supports('application/rss+xml')).toBe(true);
+    });
+
+    it('returns false for generic text/xml but true for application/rss', () => {
+      expect(parser.supports('text/xml')).toBe(false); // Not strictly RSS
+      expect(parser.supports('application/rss')).toBe(true);
+    });
+
+    it('returns false for atom content type', () => {
+      expect(parser.supports('application/atom+xml')).toBe(false);
+    });
+
+    it('is case insensitive', () => {
+      expect(parser.supports('APPLICATION/RSS+XML')).toBe(true);
+    });
+  });
+});
diff --git a/modules/parser/rss.parser.ts b/modules/parser/rss.parser.ts
new file mode 100644
index 0000000..e667531
--- /dev/null
+++ b/modules/parser/rss.parser.ts
@@ -0,0 +1,110 @@
+import type { FeedItem } from '../../interfaces/feed.types.js';
+import type { IParser } from '../../interfaces/parser.interface.js';
+import { XMLParser } from 'fast-xml-parser';
+import { generateId, parseDate, isValidXml } from './utils.js';
+
+interface RssChannel {
+  title?: string;
+  link?: string;
+  description?: string;
+}
+
+interface RssItem {
+  title?: string;
+  link?: string;
+  description?: string;
+  'content:encoded'?: string;
+  pubDate?: string;
+  guid?: string;
+}
+
+interface RssFeed {
+  rss?: {
+    channel?: {
+      item?: RssItem[] | RssItem;
+    };
+  };
+}
+
+/**
+ * Parser for RSS 2.0 feeds. Tag values are kept as strings so that
+ * numeric-looking titles are never coerced to numbers by the XML parser.
+ */
+export class RssParser implements IParser {
+  private xmlParser: XMLParser;
+
+  constructor() {
+    this.xmlParser = new XMLParser({
+      ignoreAttributes: false,
+      attributeNamePrefix: '@_',
+      parseAttributeValue: false,
+      parseTagValue: false,
+      trimValues: true,
+    });
+  }
+
+  /** Parses RSS XML into items tagged with `source`; throws on invalid XML, structure, or item fields. */
+  async parse(xml: string, source: string): Promise<FeedItem[]> {
+    if (!isValidXml(xml)) {
+      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
+    }
+
+    let parsed: RssFeed;
+    try {
+      parsed = this.xmlParser.parse(xml) as RssFeed;
+    } catch (error) {
+      throw new Error(`XML parsing failed: ${(error as Error).message}`);
+    }
+
+    if (!parsed.rss) {
+      throw new Error('Invalid RSS: missing <rss> root element');
+    }
+    if (!parsed.rss.channel) {
+      throw new Error('Invalid RSS: missing <channel> element');
+    }
+
+    const items = parsed.rss.channel.item;
+    if (!items) {
+      return [];
+    }
+    const itemArray = Array.isArray(items) ? items : [items];
+    return itemArray.map((item) => this.parseItem(item, source));
+  }
+
+  /** Maps one <item> to a FeedItem, enforcing the required title and link. */
+  private parseItem(item: RssItem, source: string): FeedItem {
+    if (!item.title) {
+      throw new Error('RSS item missing required field: title');
+    }
+    if (!item.link) {
+      throw new Error('RSS item missing required field: link');
+    }
+
+    const url = item.link;
+    const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
+    return {
+      // Hash undated items against the epoch so their ID is stable across fetches.
+      id: generateId(url, item.pubDate ? publishedAt : new Date(0)),
+      source,
+      title: this.cleanText(item.title),
+      url,
+      publishedAt,
+      summary: item.description ? this.cleanText(item.description) : undefined,
+      content: item['content:encoded'] ? this.cleanText(item['content:encoded']) : undefined,
+    };
+  }
+
+  /** Strips a literal CDATA wrapper, if any, and trims the text. */
+  private cleanText(text: string): string {
+    if (!text) return '';
+    return text
+      .replace(/^<!\[CDATA\[/, '')
+      .replace(/\]\]>$/, '')
+      .trim();
+  }
+
+  /** True when the content type mentions "rss" (case-insensitive). */
+  supports(contentType: string): boolean {
+    return contentType.toLowerCase().includes('rss');
+  }
+}
diff --git a/modules/parser/utils.ts b/modules/parser/utils.ts
new file mode 100644
index 0000000..e971bd1
--- /dev/null
+++ b/modules/parser/utils.ts
@@ -0,0 +1,75 @@
+/**
+ * Utility functions for the parser module.
+ */
+
+/**
+ * Derives a stable item ID from the article URL and its publication instant.
+ * Hashes "<url>::<ISO timestamp>" with djb2 and returns unsigned 32-bit hex.
+ */
+export function generateId(url: string, publishedAt: Date): string {
+  const key = `${url}::${publishedAt.toISOString()}`;
+  let acc = 5381;
+  for (let pos = 0; pos < key.length; pos += 1) {
+    acc = (acc << 5) + acc + key.charCodeAt(pos);
+  }
+  return (acc >>> 0).toString(16); // coerce to unsigned 32-bit, then hex
+}
+
+/**
+ * Parses dates from RSS (RFC 822) and Atom (ISO 8601) formats.
+ * Native `Date` parsing is tried first; a manual RFC 822 parser is the fallback.
+ * The fallback honours the zone (GMT, UTC or ±hhmm) and treats a missing zone
+ * as UTC, so the result never depends on the host machine's local timezone.
+ * @param dateStr - Raw date text taken from the feed.
+ * @returns The parsed instant.
+ * @throws {Error} If the input is empty, not a string, or cannot be parsed.
+ */
+export function parseDate(dateStr: string): Date {
+  if (!dateStr || typeof dateStr !== 'string') {
+    throw new Error(`Invalid date string: ${dateStr}`);
+  }
+
+  const trimmed = dateStr.trim();
+
+  // Try native Date parsing first (handles ISO 8601 and many common formats)
+  const native = new Date(trimmed);
+  if (!Number.isNaN(native.getTime())) {
+    return native;
+  }
+
+  // Try RFC 822 format: Mon, 06 Sep 2024 09:00:00 GMT (or a numeric zone such as +0200)
+  const rfc822Match = trimmed.match(
+    /^\w{3},?\s+(\d{1,2})\s+(\w{3})\s+(\d{4})\s+(\d{1,2}):(\d{2}):(\d{2})\s*(?:GMT|UTC|([+-])(\d{2})(\d{2}))?$/i
+  );
+  if (rfc822Match) {
+    const months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
+    const month = months.indexOf(rfc822Match[2].toLowerCase());
+    if (month !== -1) {
+      const num = (index: number): number => parseInt(rfc822Match[index] ?? '0', 10);
+      // Build the instant in UTC, then remove the zone offset (local = UTC + offset).
+      const offsetMinutes = (rfc822Match[7] === '-' ? -1 : 1) * (num(8) * 60 + num(9));
+      const utcMs = Date.UTC(num(3), month, num(1), num(4), num(5), num(6)) - offsetMinutes * 60000;
+      const date = new Date(utcMs);
+      if (!Number.isNaN(date.getTime())) {
+        return date;
+      }
+    }
+  }
+
+  throw new Error(`Unable to parse date: ${dateStr}`);
+}
+
+/**
+ * Cheap pre-check that a payload looks like an XML feed document.
+ * Only the leading token (after trimming) is inspected; well-formedness is
+ * not verified. Leading comments and DOCTYPE declarations are accepted so
+ * that feeds without an XML declaration are not rejected.
+ */
+export function isValidXml(xml: string): boolean {
+  if (!xml || typeof xml !== 'string') {
+    return false;
+  }
+  const prefixes = ['<?xml', '<!--', '<!DOCTYPE', '<rss', '<feed', '<channel'];
+  const trimmed = xml.trim();
+  return prefixes.some((prefix) => trimmed.startsWith(prefix));
+}
diff --git a/package-lock.json b/package-lock.json
index 5832954..97e1bdc 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -8,6 +8,7 @@
       "name": "pulse",
       "version": "0.1.0",
       "dependencies": {
+        "fast-xml-parser": "^5.7.3",
         "undici": "^6.21.0"
       },
       "devDependencies": {
@@ -466,6 +467,18 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/@nodable/entities": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@nodable/entities/-/entities-2.1.0.tgz",
+      "integrity": "sha512-nyT7T3nbMyBI/lvr6L5TyWbFJAI9FTgVRakNoBqCD+PmID8DzFrrNdLLtHMwMszOtqZa8PAOV24ZqDnQrhQINA==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/nodable"
+        }
+      ],
+      "license": "MIT"
+    },
     "node_modules/@rollup/rollup-android-arm-eabi": {
       "version": "4.60.3",
       "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.60.3.tgz",
@@ -1090,6 +1103,42 @@
         "node": ">=12.0.0"
       }
     },
+    "node_modules/fast-xml-builder": {
+      "version": "1.1.8",
+      "resolved": "https://registry.npmjs.org/fast-xml-builder/-/fast-xml-builder-1.1.8.tgz",
+      "integrity": "sha512-sDVBc2gg8pSKvcbE8rBmOyjSGQf0AdsbqvHeIOv3D/uYNoV4eCReQXyDF8Pdv8+m1FHazACypSz2hR7O2S1LLw==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "path-expression-matcher": "^1.1.3"
+      }
+    },
+    "node_modules/fast-xml-parser": {
+      "version": "5.7.3",
+      "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-5.7.3.tgz",
+      "integrity": "sha512-C0AaNuC+mscy6vrAQKAc/rMq+zAPHodfHGZu4sGVehvAQt/JLG1O5zEcYcXSY5zSqr4YVgxsB+pHXTq0i7eDlg==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "@nodable/entities": "^2.1.0",
+        "fast-xml-builder": "^1.1.7",
+        "path-expression-matcher": "^1.5.0",
+        "strnum": "^2.2.3"
+      },
+      "bin": {
+        "fxparser": "src/cli/cli.js"
+      }
+    },
     "node_modules/fsevents": {
       "version": "2.3.3",
       "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
@@ -1161,6 +1210,21 @@
         "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
       }
     },
+    "node_modules/path-expression-matcher": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/path-expression-matcher/-/path-expression-matcher-1.5.0.tgz",
+      "integrity": "sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT",
+      "engines": {
+        "node": ">=14.0.0"
+      }
+    },
     "node_modules/pathe": {
       "version": "1.1.2",
       "resolved": "https://registry.npmjs.org/pathe/-/pathe-1.1.2.tgz",
@@ -1300,6 +1364,18 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/strnum": {
+      "version": "2.2.3",
+      "resolved": "https://registry.npmjs.org/strnum/-/strnum-2.2.3.tgz",
+      "integrity": "sha512-oKx6RUCuHfT3oyVjtnrmn19H1SiCqgJSg+54XqURKp5aCMbrXrhLjRN9TjuwMjiYstZ0MzDrHqkGZ5dFTKd+zg==",
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT"
+    },
     "node_modules/tinybench": {
       "version": "2.9.0",
       "resolved": "https://registry.npmjs.org/tinybench/-/tinybench-2.9.0.tgz",
diff --git a/package.json b/package.json
index 42fd78a..d54be99 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
     "vitest": "^2.1.0"
   },
   "dependencies": {
+    "fast-xml-parser": "^5.7.3",
     "undici": "^6.21.0"
   }
 }