Edo Limburg 2ea1f5cd95 Add parser module with RSS and Atom feed parsers
- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
2026-05-05 21:15:20 +02:00

76 lines
2.2 KiB
TypeScript

/**
* Utility functions for the parser module.
*/
/**
 * Builds a deterministic identifier for an article from its URL and
 * publication timestamp.
 *
 * The two values are joined as `<url>::<ISO-8601 timestamp>` and hashed with
 * the djb2 algorithm (seed 5381, `hash * 33 + charCode` per UTF-16 code unit).
 *
 * @param url - Article URL.
 * @param publishedAt - Publication date; `toISOString()` throws a `RangeError`
 *   if this is an invalid `Date`.
 * @returns The hash as an unsigned 32-bit value in lowercase hexadecimal,
 *   without zero padding (1 to 8 characters).
 */
export function generateId(url: string, publishedAt: Date): string {
  const key = `${url}::${publishedAt.toISOString()}`;

  let acc = 5381;
  let index = 0;
  while (index < key.length) {
    // (acc << 5) + acc is acc * 33, the classic djb2 step.
    acc = (acc << 5) + acc + key.charCodeAt(index);
    index += 1;
  }

  // Reinterpret as an unsigned 32-bit integer before rendering as hex.
  const unsigned = acc >>> 0;
  return unsigned.toString(16);
}
/** RFC 822 month abbreviations (lowercase) mapped to zero-based month indexes. */
const RFC822_MONTHS: ReadonlyMap<string, number> = new Map([
  ['jan', 0], ['feb', 1], ['mar', 2], ['apr', 3], ['may', 4], ['jun', 5],
  ['jul', 6], ['aug', 7], ['sep', 8], ['oct', 9], ['nov', 10], ['dec', 11],
]);

/** Named zones from RFC 822 section 5.1 (lowercase), as minutes east of UTC. */
const RFC822_ZONE_OFFSETS: ReadonlyMap<string, number> = new Map([
  ['ut', 0], ['utc', 0], ['gmt', 0], ['z', 0],
  ['est', -300], ['edt', -240],
  ['cst', -360], ['cdt', -300],
  ['mst', -420], ['mdt', -360],
  ['pst', -480], ['pdt', -420],
]);

/**
 * `Mon, 06 Sep 2024 09:00:00 GMT` — captures day, month name, year, hour,
 * minute, second and an optional zone (name or `+hhmm` / `-hhmm`).
 */
const RFC822_PATTERN =
  /^\w{3},?\s+(\d{1,2})\s+(\w{3})\s+(\d{4})\s+(\d{1,2}):(\d{2}):(\d{2})\s*([a-z]{1,3}|[+-]\d{4})?$/i;

/** Converts an RFC 822 zone token to minutes east of UTC; `undefined` if unknown. */
function rfc822ZoneOffsetMinutes(zone: string): number | undefined {
  const numeric = /^([+-])(\d{2})(\d{2})$/.exec(zone);
  if (numeric === null) {
    return RFC822_ZONE_OFFSETS.get(zone.toLowerCase());
  }
  const [, sign = '+', hoursText = '', minutesText = ''] = numeric;
  const minutes = Number(minutesText);
  if (minutes > 59) {
    return undefined;
  }
  const total = Number(hoursText) * 60 + minutes;
  return sign === '-' ? -total : total;
}

/** Parses an RFC 822 date-time; `undefined` if the text does not match or is out of range. */
function parseRfc822Date(text: string): Date | undefined {
  const match = RFC822_PATTERN.exec(text);
  if (match === null) {
    return undefined;
  }
  const [
    ,
    dayText = '',
    monthText = '',
    yearText = '',
    hourText = '',
    minuteText = '',
    secondText = '',
    zone,
  ] = match;

  const month = RFC822_MONTHS.get(monthText.toLowerCase());
  if (month === undefined) {
    return undefined;
  }

  const year = Number(yearText);
  const day = Number(dayText);
  const hour = Number(hourText);
  const minute = Number(minuteText);
  const second = Number(secondText);

  // Reject out-of-range fields explicitly: the Date constructor would
  // otherwise roll "32 Sep" over into October instead of failing.
  // Day 0 of the following month is the last day of this month.
  const lastDayOfMonth = new Date(Date.UTC(year, month + 1, 0)).getUTCDate();
  if (day < 1 || day > lastDayOfMonth || hour > 23 || minute > 59 || second > 60) {
    return undefined;
  }

  if (zone === undefined) {
    // No zone given: interpret the fields in the host's local time.
    return new Date(year, month, day, hour, minute, second);
  }

  const offsetMinutes = rfc822ZoneOffsetMinutes(zone);
  if (offsetMinutes === undefined) {
    return undefined;
  }
  // The fields are wall-clock time in the stated zone; subtract the offset
  // to obtain the UTC instant.
  return new Date(Date.UTC(year, month, day, hour, minute, second) - offsetMinutes * 60000);
}

/**
 * Parses a feed date in RSS (RFC 822) or Atom (ISO 8601) format.
 *
 * The engine's native `Date` parser is tried first. If it rejects the input,
 * an explicit RFC 822 parser is used as a fallback. The fallback honours the
 * zone (`GMT`, `UTC`, `UT`, `Z`, the North American zones from RFC 822, or a
 * numeric `+hhmm` / `-hhmm` offset) and rejects out-of-range day and time
 * fields rather than letting them roll over. A fallback date without any zone
 * is interpreted in local time.
 *
 * @param dateStr - Date text taken from the feed; surrounding whitespace is ignored.
 * @returns The parsed `Date`.
 * @throws Error if `dateStr` is empty or not a string, or if neither parser
 *   can interpret it.
 */
export function parseDate(dateStr: string): Date {
  if (!dateStr || typeof dateStr !== 'string') {
    throw new Error(`Invalid date string: ${dateStr}`);
  }
  const trimmed = dateStr.trim();

  // Native parsing handles ISO 8601 and, on most engines, RFC 822 as well.
  const native = new Date(trimmed);
  if (!Number.isNaN(native.getTime())) {
    return native;
  }

  const fallback = parseRfc822Date(trimmed);
  if (fallback !== undefined) {
    return fallback;
  }

  throw new Error(`Unable to parse date: ${dateStr}`);
}
/**
 * Cheap sniff test for feed-like XML.
 *
 * This does not check well-formedness. It only reports whether the input,
 * after trimming surrounding whitespace, begins with an XML declaration or
 * one of the root elements this module expects.
 *
 * @param xml - Candidate document text.
 * @returns `true` if the trimmed text starts with `<?xml`, `<rss`, `<feed`
 *   or `<channel`; `false` otherwise, including for empty or non-string input.
 */
export function isValidXml(xml: string): boolean {
  if (typeof xml !== 'string' || xml.length === 0) {
    return false;
  }

  const acceptedPrefixes = ['<?xml', '<rss', '<feed', '<channel'];
  const head = xml.trim();
  return acceptedPrefixes.some((prefix) => head.startsWith(prefix));
}