- Create separate RssParser and AtomParser implementing IParser interface
- Add utility functions for ID generation (djb2 hash) and date parsing
- Support both RSS (RFC 822) and Atom (ISO 8601) date formats
- Handle Atom elements with attributes (type="html") via #text property
- Map RSS <description> to summary and <content:encoded> to content
- Map Atom <summary> to summary and <content> to content
- Prefer Atom link[@rel="alternate"] for article URLs
- Throw descriptive errors for malformed XML and missing required fields
- Add comprehensive test coverage for both parsers (32 tests total)
76 lines
2.2 KiB
TypeScript
76 lines
2.2 KiB
TypeScript
/**
 * Utility functions for the parser module.
 */

/**
 * Builds a deterministic identifier from an article URL and its publication date.
 *
 * The key `${url}::${publishedAt.toISOString()}` is hashed with djb2
 * (seed 5381, `hash * 33 + codeUnit` for every UTF-16 code unit) and the
 * 32-bit result is returned as unsigned lowercase hexadecimal without
 * zero padding.
 *
 * @param url - Article URL; used verbatim as the first part of the hash key.
 * @param publishedAt - Publication date; contributes its ISO 8601 (UTC) string.
 * @returns The hash as 1 to 8 lowercase hex digits.
 * @throws RangeError if `publishedAt` is an invalid date (raised by `toISOString`).
 */
export function generateId(url: string, publishedAt: Date): string {
  const key = `${url}::${publishedAt.toISOString()}`;
  let hash = 5381;
  for (let i = 0; i < key.length; i++) {
    // `(hash << 5) + hash` is `hash * 33`. The trailing `| 0` wraps the
    // accumulator to a signed 32-bit integer on every step, so it always stays
    // an exactly representable integer no matter how long the key is.
    hash = (((hash << 5) + hash) + key.charCodeAt(i)) | 0;
  }
  // `>>> 0` reinterprets the signed value as unsigned before hex formatting.
  return (hash >>> 0).toString(16);
}

/**
 * Parses a feed timestamp (RSS / RFC 822 or Atom / ISO 8601) into a `Date`.
 *
 * The trimmed input is first handed to the native `Date` constructor. Only if
 * the engine rejects it is an explicit RFC 822 pattern tried, e.g.
 * `Mon, 06 Sep 2024 09:00:00 GMT` or `Mon, 06 Sep 2024 09:00:00 +0200`.
 *
 * In that fallback the zone designator (`GMT`, `UTC` or `±HHMM`) is honoured,
 * so the result does not depend on the host time zone. When no zone is
 * present the fields are interpreted as local time. Out-of-range fields are
 * normalised by `Date` (for instance a leap second `23:59:60` rolls over to
 * the next minute).
 *
 * @param dateStr - Raw date text taken from the feed.
 * @returns The parsed date.
 * @throws Error if `dateStr` is empty or not a string, or if neither the
 *   native parser nor the RFC 822 fallback can interpret it.
 */
export function parseDate(dateStr: string): Date {
  if (!dateStr || typeof dateStr !== 'string') {
    throw new Error(`Invalid date string: ${dateStr}`);
  }

  const trimmed = dateStr.trim();

  // Native parsing handles ISO 8601 and most common RFC 822 spellings.
  const native = new Date(trimmed);
  if (!Number.isNaN(native.getTime())) {
    return native;
  }

  // Fallback: weekday, day, month name, 4-digit year, h:mm:ss, optional zone.
  const rfc822Match = trimmed.match(
    /^\w{3},?\s+(\d{1,2})\s+(\w{3})\s+(\d{4})\s+(\d{1,2}):(\d{2}):(\d{2})\s*(GMT|UTC|[+-]\d{4})?$/i,
  );
  if (rfc822Match) {
    const monthNames = [
      'jan', 'feb', 'mar', 'apr', 'may', 'jun',
      'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    ];
    const [, dayText, monthText = '', yearText, hourText, minuteText, secondText, zone] =
      rfc822Match;
    const month = monthNames.indexOf(monthText.toLowerCase());

    if (month !== -1) {
      const year = Number(yearText);
      const day = Number(dayText);
      const hour = Number(hourText);
      const minute = Number(minuteText);
      const second = Number(secondText);

      let parsed: Date;
      if (zone === undefined) {
        // No zone designator: keep treating the fields as local time.
        parsed = new Date(year, month, day, hour, minute, second);
      } else {
        // GMT/UTC carry no offset; "+HHMM"/"-HHMM" is the zone's offset from
        // UTC, so it is subtracted from the wall-clock fields to get UTC.
        let offsetMinutes = 0;
        if (zone.startsWith('+') || zone.startsWith('-')) {
          const sign = zone.startsWith('-') ? -1 : 1;
          offsetMinutes = sign * (Number(zone.slice(1, 3)) * 60 + Number(zone.slice(3, 5)));
        }
        parsed = new Date(
          Date.UTC(year, month, day, hour, minute, second) - offsetMinutes * 60000,
        );
      }

      if (!Number.isNaN(parsed.getTime())) {
        return parsed;
      }
    }
  }

  throw new Error(`Unable to parse date: ${dateStr}`);
}

/**
 * Cheap pre-check that a string looks like an XML feed document.
 *
 * This does not parse or validate the XML. After trimming surrounding
 * whitespace it only tests whether the text begins with an XML declaration
 * (`<?xml`) or with one of the tag prefixes `<rss`, `<feed` or `<channel`.
 *
 * @param xml - Candidate document text.
 * @returns `true` when the trimmed text starts with one of the recognised
 *   prefixes; `false` for empty input, non-string input, or any other text.
 */
export function isValidXml(xml: string): boolean {
  if (!xml || typeof xml !== 'string') {
    return false;
  }

  const trimmed = xml.trim();
  const recognisedPrefixes = ['<?xml', '<rss', '<feed', '<channel'];
  return recognisedPrefixes.some((prefix) => trimmed.startsWith(prefix));
}