// pulse/modules/parser/rss.parser.ts

import type { FeedItem } from '../../interfaces/feed.types.js';
import type { IParser } from '../../interfaces/parser.interface.js';
import { XMLParser } from 'fast-xml-parser';
import { generateId, parseDate, isValidXml } from './utils.js';

/**
 * Channel-level metadata fields of an RSS document.
 * Every field is optional because a feed may omit any of them.
 */
interface RssChannel {
  /** Text of the channel's `<title>` element. */
  title?: string;
  /** Text of the channel's `<link>` element. */
  link?: string;
  /** Text of the channel's `<description>` element. */
  description?: string;
}

/**
 * Attributes of an `<enclosure>` element as they appear in the parsed
 * object tree. Keys carry the `@_` prefix that `RssParser` configures as
 * the XML parser's attribute name prefix.
 */
interface RssEnclosure {
  /** The `url` attribute. */
  '@_url'?: string;
  /** The `type` attribute; `RssParser` checks it for an `image/` prefix. */
  '@_type'?: string;
  /** The `length` attribute, kept as a string (attribute values are not parsed). */
  '@_length'?: string;
}

/**
 * A single `<item>` element as it appears in the parsed object tree.
 * All fields are optional at the type level; `RssParser.parseItem` rejects
 * items that lack `title` or `link`.
 */
interface RssItem {
  /** Item headline; required by `RssParser.parseItem`. */
  title?: string;
  /** Item URL; required by `RssParser.parseItem`. */
  link?: string;
  /** Summary text, or full HTML content in feeds that have no `content:encoded`. */
  description?: string;
  /** Full item content, when the feed provides it. */
  'content:encoded'?: string;
  /** Publication date string; handed to `parseDate` by the parser. */
  pubDate?: string;
  /** The item's `<guid>` value (not read by `RssParser`). */
  guid?: string;
  /** One enclosure, or an array when the item carries several. */
  enclosure?: RssEnclosure | RssEnclosure[];
}

/**
 * Shape that `RssParser.parse` expects from the XML parser's output:
 * an `rss` root containing a `channel` with zero or more items.
 */
interface RssFeed {
  rss?: {
    channel?: {
      // May be a single object or an array; `RssParser.parse` normalizes it
      // to an array before mapping.
      item?: RssItem[] | RssItem;
    };
  };
}

/**
 * Parser for RSS 2.0 feeds.
 *
 * Turns an RSS XML document into `FeedItem` objects, one for each `<item>`
 * found under `<rss><channel>`.
 */
export class RssParser implements IParser {
  private readonly xmlParser: XMLParser;

  constructor() {
    this.xmlParser = new XMLParser({
      ignoreAttributes: false,
      attributeNamePrefix: '@_',
      parseAttributeValue: false,
      // Keep element text as strings. With the library default, a title such
      // as "2024" would be converted to a number and break the string
      // handling below (and contradict the RssItem field types).
      parseTagValue: false,
      trimValues: true,
    });
  }

  /**
   * Parses an RSS document into feed items.
   *
   * @param xml - Raw RSS XML text.
   * @param source - Identifier of the feed, copied onto every returned item.
   * @returns One `FeedItem` per `<item>`; an empty array when the channel has none.
   * @throws Error when the input fails the `isValidXml` check, the XML parser
   *   throws, the `<rss>` root or `<channel>` element is missing, or any item
   *   lacks a title or link.
   */
  async parse(xml: string, source: string): Promise<FeedItem[]> {
    if (!isValidXml(xml)) {
      throw new Error('Invalid XML: does not appear to be valid RSS/XML');
    }

    let parsed: RssFeed;
    try {
      parsed = this.xmlParser.parse(xml) as RssFeed;
    } catch (error) {
      // Thrown values are not guaranteed to be Error instances.
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`XML parsing failed: ${reason}`);
    }

    if (!parsed.rss) {
      throw new Error('Invalid RSS: missing <rss> root element');
    }

    const channel = parsed.rss.channel;
    if (!channel) {
      throw new Error('Invalid RSS: missing <channel> element');
    }

    const items = channel.item;
    if (!items) {
      return [];
    }

    // A single <item> is parsed as an object rather than a one-element array.
    const itemArray = Array.isArray(items) ? items : [items];

    return itemArray.map((item) => this.parseItem(item, source));
  }

  /**
   * Converts one parsed `<item>` into a `FeedItem`.
   *
   * Content selection:
   * - `content:encoded` present: it becomes `content`, `description` becomes `summary`.
   * - otherwise, a description that looks like full HTML becomes `content`
   *   and a plain-text summary is derived from it.
   * - otherwise, the description is used as `summary` and `content` is undefined.
   *
   * @throws Error when the item has no title or no link.
   */
  private parseItem(item: RssItem, source: string): FeedItem {
    if (!item.title) {
      throw new Error('RSS item missing required field: title');
    }

    if (!item.link) {
      throw new Error('RSS item missing required field: link');
    }

    // Items without a pubDate are stamped with the current time.
    const publishedAt = item.pubDate ? parseDate(item.pubDate) : new Date();
    const url = item.link;

    const description = item.description ? this.cleanText(item.description) : undefined;
    const contentEncoded = item['content:encoded']
      ? this.cleanText(item['content:encoded'])
      : undefined;

    let content: string | undefined;
    let summary: string | undefined;

    if (contentEncoded) {
      // Standard case: content:encoded has full content, description has summary.
      content = contentEncoded;
      summary = description;
    } else if (description && this.looksLikeFullContent(description)) {
      // Some feeds put the full HTML body in <description>.
      content = description;
      summary = this.extractSummary(description);
    } else {
      // Description is just a summary (or absent).
      summary = description;
    }

    return {
      id: generateId(url, publishedAt),
      source,
      title: this.cleanText(item.title),
      url,
      publishedAt,
      summary,
      content,
      imageUrl: this.extractImageUrl(item.enclosure),
    };
  }

  /**
   * Returns the URL of the first enclosure that has a URL and either no
   * `type` attribute or an `image/*` type. The type comparison ignores case.
   *
   * @returns The image URL, or `undefined` when no enclosure qualifies.
   */
  private extractImageUrl(enclosure: RssEnclosure | RssEnclosure[] | undefined): string | undefined {
    if (!enclosure) {
      return undefined;
    }

    // Handle single enclosure or array of enclosures.
    const enclosures = Array.isArray(enclosure) ? enclosure : [enclosure];

    for (const enc of enclosures) {
      const url = enc['@_url'];
      if (!url) {
        continue;
      }

      const type = enc['@_type'];
      // Media types are case-insensitive, so normalize before comparing.
      if (!type || type.toLowerCase().startsWith('image/')) {
        return url;
      }
    }

    return undefined;
  }

  /**
   * Heuristic: does `text` look like a full HTML article body rather than a
   * short summary? True when it contains a common HTML tag AND is either
   * longer than 500 characters or has multiple paragraphs/line breaks.
   */
  private looksLikeFullContent(text: string): boolean {
    if (!text) return false;

    // Common HTML tags that indicate marked-up content.
    const hasHtmlTags = /<(p|div|br|h[1-6]|ul|ol|li|img|a|strong|em|blockquote)[\s>]/i.test(text);

    // More than 500 chars suggests full content.
    const isLong = text.length > 500;

    // At least two <p> tags, or at least three segments separated by blank
    // lines / <br>.
    const hasMultipleParagraphs =
      (text.match(/<p[\s>]/gi) || []).length >= 2 ||
      text.split(/\n\n|<br\s*\/?>/i).length >= 3;

    return hasHtmlTags && (isLong || hasMultipleParagraphs);
  }

  /**
   * Builds a plain-text summary from HTML content: tags are removed,
   * whitespace is collapsed, and the result is cut to at most 200 characters
   * (plus `...`), preferably at a word boundary.
   */
  private extractSummary(content: string): string {
    if (!content) return '';

    // Replace tags with spaces so adjacent words do not run together.
    const textOnly = content.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();

    if (textOnly.length <= 200) {
      return textOnly;
    }

    const truncated = textOnly.substring(0, 200);
    const lastSpace = truncated.lastIndexOf(' ');
    // With no space in the first 200 characters there is no word boundary to
    // cut at; keep the hard cut instead of collapsing the summary to "...".
    const cut = lastSpace > 0 ? truncated.substring(0, lastSpace) : truncated;
    return cut + '...';
  }

  /**
   * Normalizes an element's text: trims it and removes a surrounding CDATA
   * wrapper if one is present.
   *
   * Numbers, booleans and bigints are converted to their string form;
   * `null`, `undefined` and any other non-string value yield `''`.
   */
  private cleanText(text: unknown): string {
    if (text == null) return '';

    let value: string;
    if (typeof text === 'string') {
      value = text;
    } else if (typeof text === 'number' || typeof text === 'boolean' || typeof text === 'bigint') {
      value = String(text);
    } else {
      return '';
    }

    // Trim first so the anchored CDATA patterns still match when the value
    // carries leading or trailing whitespace.
    return value
      .trim()
      .replace(/^<!\[CDATA\[/, '')
      .replace(/\]\]>$/, '')
      .trim();
  }

  /**
   * Reports whether this parser handles the given content type: true when
   * it contains `rss`, compared case-insensitively.
   */
  supports(contentType: string): boolean {
    return contentType.toLowerCase().includes('rss');
  }
}