// pulse/modules/parser/atom.parser.ts

import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IParser } from '../../interfaces/parser.interface.js';
import { XMLParser } from 'fast-xml-parser';
import { generateId, parseDate, isValidXml } from './utils.js';

/**
 * An Atom `<link>` element as it appears in the parsed XML tree.
 * XML attributes are exposed under keys carrying the `@_` prefix.
 */
interface AtomLink {
  /** Link relation; this file compares it against `alternate` and `self`. */
  '@_rel'?: string;
  /** Target of the link (the `href` attribute). */
  '@_href'?: string;
}

/**
 * A text-bearing Atom element that was parsed into an object rather than a
 * plain string, which happens when the element carries attributes such as
 * `type="html"`.
 */
interface AtomTextField {
  /** Value of the element's `type` attribute, when present. */
  '@_type'?: string;
  /** Text content of the element. */
  '#text'?: string;
}

/**
 * A single Atom `<entry>` as it appears in the parsed XML tree.
 * Every field is optional because the parser output is not validated.
 */
interface AtomEntry {
  /** Entry identifier (`<id>`). */
  id?: string;
  /** Entry title; an object when the element carries attributes. */
  title?: AtomTextField | string;
  /** One `<link>` element, or an array when the entry has several. */
  link?: AtomLink | AtomLink[];
  /** Publication timestamp text (`<published>`). */
  published?: string;
  /** Last-modified timestamp text (`<updated>`). */
  updated?: string;
  /** Short description; an object when the element carries attributes. */
  summary?: AtomTextField | string;
  /** Entry body; an object when the element carries attributes. */
  content?: AtomTextField | string;
}

/**
 * Top-level shape of a parsed Atom document: an optional `<feed>` root that
 * holds zero, one, or many entries.
 */
interface AtomFeed {
  feed?: {
    /** A lone entry is a single object; several entries form an array. */
    entry?: AtomEntry | AtomEntry[];
  };
}

/**
 * Parser for Atom 1.0 feeds.
 *
 * Converts an Atom XML document into `FeedItem`s. XML attributes are read
 * through the `@_` prefix configured on the underlying `XMLParser`.
 */
export class AtomParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. By default fast-xml-parser converts
      // numeric-looking text to numbers, so <title>2024</title> would arrive
      // as a number and be dropped by cleanText().
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an Atom document into feed items.
   *
   * @param xml - Raw XML text of the feed.
   * @param source - Value copied verbatim into each item's `source` field.
   * @returns One item per `<entry>`; an empty array when the feed has none.
   * @throws Error when the input fails the `isValidXml` check, when the XML
   *   parser throws, when the `<feed>` root is missing, or when any entry
   *   lacks a title or a link with an href. A single bad entry rejects the
   *   whole feed.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid Atom/XML');
    }

    let parsed: AtomFeed;
    try {
      parsed = this.xmlParser.parse(xml) as AtomFeed;
    } catch (error) {
      // Anything can be thrown in JavaScript; only Error instances have a message.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.feed) {
      throw new Error('Invalid Atom: missing <feed> root element');
    }

    const entries = parsed.feed.entry;
    if (!entries) {
      return [];
    }

    // A feed with exactly one <entry> is parsed as an object, not an array.
    const entryArray = Array.isArray(entries) ? entries : [entries];

    return entryArray.map((entry) => this.parseEntry(entry, source));
  }

  /**
   * Builds a `FeedItem` from one parsed entry.
   *
   * @throws Error when the entry has no title or no link with an href.
   */
  private parseEntry(entry: AtomEntry, source: string): FeedItem {
    // Title is validated before the link, so a missing title is reported first.
    const title = this.extractTitle(entry);

    const url = this.extractUrl(entry);
    if (!url) {
      throw new Error('Atom entry missing required field: link with href');
    }

    const publishedAt = this.extractDate(entry);

    return {
      id: generateId(url, publishedAt),
      source,
      title,
      url,
      publishedAt,
      summary: entry.summary ? this.cleanText(entry.summary) : undefined,
      content: entry.content ? this.cleanText(entry.content) : undefined,
    };
  }

  /**
   * Picks the entry's article URL from its `<link>` elements.
   *
   * Only links that carry an href are considered. Preference order:
   * 1. an explicit `rel="alternate"` link;
   * 2. a link without `rel` (RFC 4287 defines a missing `rel` as "alternate");
   * 3. any link whose `rel` is not "self";
   * 4. whatever link remains.
   *
   * @returns The chosen href, or `undefined` when no link has one.
   */
  private extractUrl(entry: AtomEntry): string | undefined {
    if (!entry.link) {
      return undefined;
    }

    const links = Array.isArray(entry.link) ? entry.link : [entry.link];
    const candidates = links.filter((link) => Boolean(link['@_href']));

    const chosen =
      candidates.find((link) => link['@_rel'] === 'alternate') ??
      candidates.find((link) => link['@_rel'] === undefined) ??
      candidates.find((link) => link['@_rel'] !== 'self') ??
      candidates[0];

    return chosen?.['@_href'];
  }

  /**
   * Resolves the entry's timestamp: `<published>` first, then `<updated>`,
   * otherwise the current time.
   *
   * NOTE(review): the item id is generated from the URL and this date, so an
   * entry without any date may get a different id on each parse. Confirm
   * against `generateId`.
   */
  private extractDate(entry: AtomEntry): Date {
    if (entry.published) {
      return parseDate(entry.published);
    }
    if (entry.updated) {
      return parseDate(entry.updated);
    }
    return new Date();
  }

  /**
   * Normalizes a text element to a trimmed string.
   *
   * Accepts a plain string or the object form produced when the element has
   * attributes (for example `type="html"`), in which case `#text` is used.
   * A leading `<![CDATA[` and trailing `]]>` are stripped if present.
   *
   * @returns The cleaned text, or `''` when there is none.
   */
  private cleanText(text: string | AtomTextField | undefined): string {
    if (!text) return '';
    const textValue = typeof text === 'string' ? text : text['#text'] ?? '';
    return textValue
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /**
   * Returns the cleaned entry title.
   *
   * @throws Error when the entry has no title.
   */
  private extractTitle(entry: AtomEntry): string {
    if (!entry.title) {
      throw new Error('Atom entry missing required field: title');
    }
    return this.cleanText(entry.title);
  }

  /**
   * Reports whether this parser handles the given content type, i.e. whether
   * the value contains "atom" (case-insensitive).
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('atom');
  }
}