/**
 * Utility functions for the parser module.
 */

/**
 * Generates a deterministic identifier from a URL and a published date.
 *
 * The key `${url}::${publishedAt.toISOString()}` is hashed with the djb2
 * algorithm (hash = hash * 33 + charCode, seeded with 5381) and truncated to
 * 32 bits.
 *
 * @param url - URL of the item.
 * @param publishedAt - Publication date; serialized with `toISOString()`.
 * @returns The unsigned 32-bit hash rendered as lowercase hexadecimal
 *          (no zero padding).
 * @throws RangeError if `publishedAt` is an invalid date (raised by
 *         `toISOString()`).
 */
export function generateId(url: string, publishedAt: Date): string {
  const key = `${url}::${publishedAt.toISOString()}`;

  let hash = 5381;
  for (let i = 0; i < key.length; i++) {
    // Fold back to a signed 32-bit integer on every step so the accumulator
    // can never drift past the range where doubles are exact.
    hash = (Math.imul(hash, 33) + key.charCodeAt(i)) | 0;
  }

  // `>>> 0` reinterprets the signed value as unsigned before hex conversion.
  return (hash >>> 0).toString(16);
}

/**
 * Parses dates from RSS (RFC 822) and Atom (ISO 8601) formats.
 *
 * Native `Date` parsing is tried first. If it rejects the input, a manual
 * RFC 822 parser handles strings such as `Mon, 06 Sep 2024 09:00:00 GMT`.
 * The manual parser honors a trailing `GMT`, `UTC` or numeric `+hhmm` /
 * `-hhmm` zone; a string without any zone is interpreted in local time,
 * mirroring what native parsing does for zone-less date-time strings.
 *
 * @param dateStr - The raw date string from the feed.
 * @returns The parsed date.
 * @throws Error if `dateStr` is empty, not a string, or cannot be parsed.
 */
export function parseDate(dateStr: string): Date {
  if (!dateStr || typeof dateStr !== 'string') {
    throw new Error(`Invalid date string: ${dateStr}`);
  }

  const trimmed = dateStr.trim();

  // Try native Date parsing first (handles ISO 8601 and many common formats).
  const native = new Date(trimmed);
  if (!Number.isNaN(native.getTime())) {
    return native;
  }

  // Try RFC 822 format: Mon, 06 Sep 2024 09:00:00 GMT
  // The zone is captured (group 7) so it can be applied below.
  const rfc822Match = trimmed.match(
    /^\w{3},?\s+(\d{1,2})\s+(\w{3})\s+(\d{4})\s+(\d{1,2}):(\d{2}):(\d{2})\s*(GMT|UTC|[+-]\d{4})?$/i
  );

  if (rfc822Match) {
    const months: Record<string, number> = {
      jan: 0, feb: 1, mar: 2, apr: 3, may: 4, jun: 5,
      jul: 6, aug: 7, sep: 8, oct: 9, nov: 10, dec: 11
    };

    const month = months[rfc822Match[2].toLowerCase()];
    if (month !== undefined) {
      // Every captured group is digits-only, so Number() is an exact,
      // radix-safe conversion.
      const day = Number(rfc822Match[1]);
      const year = Number(rfc822Match[3]);
      const hours = Number(rfc822Match[4]);
      const minutes = Number(rfc822Match[5]);
      const seconds = Number(rfc822Match[6]);
      const zone: string | undefined = rfc822Match[7];

      let parsed: Date;
      if (zone === undefined) {
        // No zone designator: interpret the fields as local time.
        parsed = new Date(year, month, day, hours, minutes, seconds);
      } else {
        // GMT/UTC mean a zero offset; "+hhmm"/"-hhmm" is the local offset
        // from UTC, so subtract it to obtain the UTC instant.
        let offsetMinutes = 0;
        if (zone.startsWith('+') || zone.startsWith('-')) {
          const sign = zone.startsWith('-') ? -1 : 1;
          offsetMinutes =
            sign * (Number(zone.slice(1, 3)) * 60 + Number(zone.slice(3, 5)));
        }
        parsed = new Date(
          Date.UTC(year, month, day, hours, minutes, seconds) -
            offsetMinutes * 60000
        );
      }

      if (!Number.isNaN(parsed.getTime())) {
        return parsed;
      }
    }
  }

  throw new Error(`Unable to parse date: ${dateStr}`);
}

/**
 * Validates if a string is valid XML.
 * Basic check for XML declaration or root element.
 */
export function isValidXml(xml: string): boolean {
  if (!xml || typeof xml !== 'string') {
    return false;
  }

  const trimmed = xml.trim();
  return trimmed.startsWith('