// pulse/modules/parser/atom.parser.test.ts

import { describe, it, expect } from 'vitest';
import { AtomParser } from './atom.parser.js';

describe('AtomParser', () => {
  const parser = new AtomParser();

  describe('parse', () => {
    // Happy path: an entry carrying a title, an alternate link, summary,
    // content and a published timestamp should have each value mapped onto
    // the single parsed item, along with the feed URL and an id.
    it('parses valid Atom feed with all fields', async () => {
      const feedUrl = 'https://example.com/feed.xml';
      const atomXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test Article</title>
            <link rel="alternate" href="https://example.com/article"/>
            <summary type="html"><![CDATA[This is a summary]]></summary>
            <content type="html"><![CDATA[<p>Full content</p>]]></content>
            <published>2024-09-06T09:00:00Z</published>
          </entry>
        </feed>`;

      const parsed = await parser.parse(atomXml, feedUrl);

      expect(parsed).toHaveLength(1);

      const article = parsed[0];
      expect(article.title).toBe('Test Article');
      expect(article.url).toBe('https://example.com/article');
      // CDATA wrappers are expected to be stripped from summary and content.
      expect(article.summary).toBe('This is a summary');
      expect(article.content).toBe('<p>Full content</p>');
      expect(article.publishedAt).toEqual(new Date('2024-09-06T09:00:00Z'));
      expect(article.source).toBe(feedUrl);
      expect(article.id).toBeDefined();
    });

    // Minimal entry: only <title> and a <link href>. The optional summary and
    // content must come back undefined, while publishedAt is still a Date.
    it('parses Atom with only required fields', async () => {
      const minimalXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Minimal Article</title>
            <link href="https://example.com/minimal"/>
          </entry>
        </feed>`;

      const parsed = await parser.parse(minimalXml, 'https://example.com/feed.xml');

      expect(parsed).toHaveLength(1);

      const article = parsed[0];
      expect(article.title).toBe('Minimal Article');
      expect(article.url).toBe('https://example.com/minimal');
      expect(article.summary).toBeUndefined();
      expect(article.content).toBeUndefined();
      expect(article.publishedAt).toBeInstanceOf(Date);
    });

    // Two entries in the feed should yield two items, in document order.
    it('parses multiple entries', async () => {
      const twoEntryXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Article 1</title>
            <link href="https://example.com/1"/>
            <published>2024-09-06T09:00:00Z</published>
          </entry>
          <entry>
            <title>Article 2</title>
            <link href="https://example.com/2"/>
            <published>2024-09-07T10:00:00Z</published>
          </entry>
        </feed>`;

      const parsed = await parser.parse(twoEntryXml, 'https://example.com/feed.xml');

      expect(parsed).toHaveLength(2);

      const expectedTitles = ['Article 1', 'Article 2'];
      expectedTitles.forEach((title, position) => {
        expect(parsed[position].title).toBe(title);
      });
    });

    // A well-formed feed that contains no <entry> elements produces no items.
    it('returns empty array when no entries', async () => {
      const entrylessXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <title>Empty Feed</title>
        </feed>`;

      const parsed = await parser.parse(entrylessXml, 'https://example.com/feed.xml');

      expect(parsed).toHaveLength(0);
    });

    // The item URL must come from the rel="alternate" link even when other
    // links precede it.
    //
    // A rel="related" link is deliberately placed before the alternate one:
    // with only a "self" link in front, a parser that simply picks the first
    // non-self link would also pass, so the preference would go unverified.
    it('prefers rel="alternate" link', async () => {
      const xml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link rel="self" href="https://example.com/feed"/>
            <link rel="related" href="https://example.com/related"/>
            <link rel="alternate" href="https://example.com/article"/>
          </entry>
        </feed>`;

      const items = await parser.parse(xml, 'https://example.com/feed.xml');

      expect(items[0].url).toBe('https://example.com/article');
    });

    // With no rel="alternate" link present, the link that is not rel="self"
    // (here one without any rel attribute) supplies the item URL.
    it('falls back to first non-self link', async () => {
      const feedUrl = 'https://example.com/feed.xml';
      const atomXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link rel="self" href="https://example.com/feed"/>
            <link href="https://example.com/article"/>
          </entry>
        </feed>`;

      const parsed = await parser.parse(atomXml, feedUrl);
      const resolvedUrl = parsed[0].url;

      expect(resolvedUrl).toBe('https://example.com/article');
    });

    // An entry lacking <title> must make parse() reject with an error whose
    // message mentions the missing title field.
    it('throws on missing title', async () => {
      const untitledXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <link href="https://example.com/article"/>
          </entry>
        </feed>`;

      const pending = parser.parse(untitledXml, 'https://example.com/feed.xml');

      await expect(pending).rejects.toThrow('missing required field: title');
    });

    // An entry with no <link> at all must make parse() reject with an error
    // whose message mentions the missing link/href.
    it('throws on missing link with href', async () => {
      const linklessXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Article Without Link</title>
          </entry>
        </feed>`;

      const pending = parser.parse(linklessXml, 'https://example.com/feed.xml');

      await expect(pending).rejects.toThrow('missing required field: link with href');
    });

    // Input that is not XML must be rejected with an "Invalid XML" error.
    it('throws on invalid XML', async () => {
      const garbage = 'not xml at all';

      const pending = parser.parse(garbage, 'https://example.com/feed.xml');

      await expect(pending).rejects.toThrow('Invalid XML');
    });

    // Well-formed XML whose root is <rss> rather than <feed> must be rejected.
    it('throws on missing feed root element', async () => {
      const rssDocument = '<?xml version="1.0"?><rss></rss>';

      const pending = parser.parse(rssDocument, 'https://example.com/feed.xml');

      await expect(pending).rejects.toThrow('missing <feed> root element');
    });

    // When an entry has <updated> but no <published>, the updated timestamp
    // is what ends up in publishedAt.
    it('uses <updated> when <published> is missing', async () => {
      const updatedOnlyXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link href="https://example.com/article"/>
            <updated>2024-09-06T09:00:00Z</updated>
          </entry>
        </feed>`;
      const expectedDate = new Date('2024-09-06T09:00:00Z');

      const parsed = await parser.parse(updatedOnlyXml, 'https://example.com/feed.xml');

      expect(parsed[0].publishedAt).toEqual(expectedDate);
    });

    // When both timestamps are present and differ, publishedAt must reflect
    // <published>, not the later <updated> value.
    it('prefers <published> over <updated>', async () => {
      const bothDatesXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link href="https://example.com/article"/>
            <published>2024-09-06T09:00:00Z</published>
            <updated>2024-09-07T10:00:00Z</updated>
          </entry>
        </feed>`;
      const publishedDate = new Date('2024-09-06T09:00:00Z');

      const parsed = await parser.parse(bothDatesXml, 'https://example.com/feed.xml');

      expect(parsed[0].publishedAt).toEqual(publishedDate);
    });

    // Parsing the same document from the same source twice must yield the
    // same id for the entry on both runs.
    it('generates deterministic IDs', async () => {
      const feedUrl = 'https://example.com/feed.xml';
      const atomXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link href="https://example.com/article"/>
            <published>2024-09-06T09:00:00Z</published>
          </entry>
        </feed>`;

      // Runs are kept sequential, one after the other.
      const firstRun = await parser.parse(atomXml, feedUrl);
      const secondRun = await parser.parse(atomXml, feedUrl);

      expect(firstRun[0].id).toBe(secondRun[0].id);
    });

    // An entry with more than one <link> element (a "self" link followed by
    // an "alternate" link) should still resolve to the alternate href.
    it('handles multiple links in array format', async () => {
      const feedUrl = 'https://example.com/feed.xml';
      const multiLinkXml = `<?xml version="1.0"?>
        <feed xmlns="http://www.w3.org/2005/Atom">
          <entry>
            <title>Test</title>
            <link rel="self" href="https://example.com/feed"/>
            <link rel="alternate" href="https://example.com/article"/>
          </entry>
        </feed>`;

      const parsed = await parser.parse(multiLinkXml, feedUrl);
      const resolvedUrl = parsed[0].url;

      expect(resolvedUrl).toBe('https://example.com/article');
    });
  });

  describe('supports', () => {
    // The canonical Atom media type is supported.
    it('returns true for application/atom+xml', () => {
      const isSupported = parser.supports('application/atom+xml');

      expect(isSupported).toBe(true);
    });

    // "application/atom" (without the "+xml" suffix) is also supported.
    it('returns true for atom in content type', () => {
      const isSupported = parser.supports('application/atom');

      expect(isSupported).toBe(true);
    });

    // The RSS media type must not be claimed by the Atom parser.
    it('returns false for rss content type', () => {
      const isSupported = parser.supports('application/rss+xml');

      expect(isSupported).toBe(false);
    });

    // An all-uppercase Atom media type is still supported.
    it('is case insensitive', () => {
      const isSupported = parser.supports('APPLICATION/ATOM+XML');

      expect(isSupported).toBe(true);
    });
  });
});