diff --git a/src/extractors/BaseNewspaperExtractor.ts b/src/extractors/BaseNewspaperExtractor.ts new file mode 100644 index 0000000..80f0fea --- /dev/null +++ b/src/extractors/BaseNewspaperExtractor.ts @@ -0,0 +1,78 @@ +import { WebScraper } from '../utils/WebScraper'; +import { IFeed, NewsSource } from '../types/Feed'; +import { NewspaperConfig } from '../types/NewspaperTypes'; +import { Logger } from '../utils/logger'; + +/** + * Abstract base class for newspaper extractors: holds the WebScraper and the NewspaperConfig used by every concrete extractor + */ +export abstract class BaseNewspaperExtractor { + protected webScraper: WebScraper; + protected config: NewspaperConfig; + + constructor(config: NewspaperConfig) { + this.webScraper = new WebScraper(); + this.config = config; + } + + /** + * Abstract method each concrete extractor must implement; extractNews() awaits it and scrapes every URL it yields + */ + abstract extractFrontPageUrls(): Promise; + + /** + * Extracts news from the front-page URLs: scrapes them one at a time, converts each truthy result to a feed item, logs and skips an article whose scrape throws, and returns an empty array when no URLs are found or the extraction as a whole fails + */ + async extractNews(): Promise[]> { + try { + Logger.info(`Extracting front page URLs for ${this.config.name}`); + const urls = await this.extractFrontPageUrls(); + + if (urls.length === 0) { + Logger.warn(`No URLs found for ${this.config.name}`); + return []; + } + + Logger.info(`Found ${urls.length} articles for ${this.config.name}`); + const newsItems: Omit[] = []; + + for (const url of urls) { + try { + const scrapedData = await this.webScraper.scrapeUrl(url); + if (scrapedData) { + const feedItem = this.webScraper.convertToFeedData(scrapedData, this.config.source); + newsItems.push(feedItem); + } + } catch (error) { + Logger.error(`Error scraping article ${url}:`, error); + } + } + + return newsItems; + } catch (error) { + Logger.error(`Error extracting news for ${this.config.name}:`, error); + return []; + } + } + + /** + * Reports whether the extractor is enabled (config.enabled) + */ + isEnabled(): boolean { + return this.config.enabled; + } + + /** + * Returns the newspaper name (config.name) + */ + getName(): string { + return this.config.name; + } + + /** + * Returns the newspaper source (config.source) + 
*/ + getSource(): NewsSource { + return this.config.source; + } +} \ No newline at end of file diff --git a/src/extractors/ElMundoExtractor.ts b/src/extractors/ElMundoExtractor.ts new file mode 100644 index 0000000..12a324f --- /dev/null +++ b/src/extractors/ElMundoExtractor.ts @@ -0,0 +1,78 @@ +import { BaseNewspaperExtractor } from './BaseNewspaperExtractor'; +import { NewsSource } from '../types/Feed'; +import { Logger } from '../utils/logger'; + +/** + * Extractor for El Mundo: collects up to 20 article URLs from the front page configured in the constructor + */ +export class ElMundoExtractor extends BaseNewspaperExtractor { + constructor() { + super({ + name: 'El Mundo', + source: NewsSource.EL_MUNDO, + baseUrl: 'https://elmundo.es', + frontPageUrl: 'https://elmundo.es', + selectors: { + articleLinks: '.ue-c-cover-content__link, .ue-c-cover-content__headline-link, h2 a, h3 a', + titleSelector: 'h1, .ue-c-article__headline', + descriptionSelector: '.ue-c-article__standfirst, .ue-c-cover-content__standfirst', + dateSelector: '.ue-c-article__publishdate, time', + imageSelector: '.ue-c-article__image img' + }, + enabled: true + }); + } + + async extractFrontPageUrls(): Promise { + // Fetch the front-page HTML directly with fetch; this call is outside the try block below, so a rejected fetch propagates to the caller + const response = await fetch(this.config.frontPageUrl, { + headers: { + 'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + } + }); + + if (!response.ok) { + Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`); + return []; + } + + const html = await response.text(); + if (!html) { + return []; + } + + try { + // Extract article links with a regex. NOTE(review): the pattern below appears to be missing its opening anchor-tag prefix; confirm against the original file + const linkRegex = /]+href=["']([^"']*(?:elmundo\.es)?[^"']*)["'][^>]*>.*?<\/a>/gi; + const urls: string[] = []; + let match; + + while ((match = linkRegex.exec(html)) !== null) { + let url = match[1]; + + // Keep only URLs that contain one of these section path segments + if (url.includes('/espana/') || + url.includes('/internacional/') || + url.includes('/economia/') 
|| + url.includes('/sociedad/') || + url.includes('/politica/')) { + + // Turn relative URLs (those starting with a slash) into absolute ones by prefixing config.baseUrl + if (url.startsWith('/')) { + url = this.config.baseUrl + url; + } + + if (!urls.includes(url) && urls.length < 20) { + urls.push(url); + } + } + } + + return urls; + } catch (error) { + Logger.error(`Error extracting El Mundo URLs:`, error); + return []; + } + } +} \ No newline at end of file diff --git a/src/extractors/ElPaisExtractor.ts b/src/extractors/ElPaisExtractor.ts new file mode 100644 index 0000000..968bb99 --- /dev/null +++ b/src/extractors/ElPaisExtractor.ts @@ -0,0 +1,78 @@ +import { BaseNewspaperExtractor } from './BaseNewspaperExtractor'; +import { NewsSource } from '../types/Feed'; +import { Logger } from '../utils/logger'; + +/** + * Extractor for El País: collects up to 20 article URLs from the front page configured in the constructor + */ +export class ElPaisExtractor extends BaseNewspaperExtractor { + constructor() { + super({ + name: 'El País', + source: NewsSource.EL_PAIS, + baseUrl: 'https://elpais.com', + frontPageUrl: 'https://elpais.com', + selectors: { + articleLinks: 'article h2 a, .c_t a, .articulo-titulo a, h2.articulo-titulo a', + titleSelector: 'h1, .articulo-titulo', + descriptionSelector: '.articulo-entradilla, .entradilla, .subtitulo', + dateSelector: '.articulo-fecha, time', + imageSelector: '.articulo-foto img, .foto img' + }, + enabled: true + }); + } + + async extractFrontPageUrls(): Promise { + // Fetch the front-page HTML directly with fetch; this call is outside the try block below, so a rejected fetch propagates to the caller + const response = await fetch(this.config.frontPageUrl, { + headers: { + 'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' + } + }); + + if (!response.ok) { + Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`); + return []; + } + + const html = await response.text(); + if (!html) { + return []; + } + + try { + // Extract article links with a regex. NOTE(review): the pattern below appears to be missing its opening anchor-tag prefix; confirm against the original file + const linkRegex = /]+href=["']([^"']*(?:elpais\.com)?[^"']*)["'][^>]*>.*?<\/a>/gi; 
+ const urls: string[] = []; + let match; + + while ((match = linkRegex.exec(html)) !== null) { + let url = match[1]; + + // Keep only URLs that contain one of these section path segments + if (url.includes('/politica/') || + url.includes('/economia/') || + url.includes('/sociedad/') || + url.includes('/internacional/') || + url.includes('/espana/')) { + + // Turn relative URLs (those starting with a slash) into absolute ones by prefixing config.baseUrl + if (url.startsWith('/')) { + url = this.config.baseUrl + url; + } + + if (!urls.includes(url) && urls.length < 20) { + urls.push(url); + } + } + } + + return urls; + } catch (error) { + Logger.error(`Error extracting El País URLs:`, error); + return []; + } + } +} \ No newline at end of file diff --git a/src/extractors/NewspaperExtractorFactory.ts b/src/extractors/NewspaperExtractorFactory.ts new file mode 100644 index 0000000..47a0d9c --- /dev/null +++ b/src/extractors/NewspaperExtractorFactory.ts @@ -0,0 +1,37 @@ +import { BaseNewspaperExtractor } from './BaseNewspaperExtractor'; +import { ElPaisExtractor } from './ElPaisExtractor'; +import { ElMundoExtractor } from './ElMundoExtractor'; +import { NewsSource } from '../types/Feed'; +import { Logger } from '../utils/logger'; + +/** + * Factory that creates newspaper extractors: createExtractor maps a NewsSource to its extractor (logging a warning and returning null when there is none), and getAllAvailableExtractors builds one for every NewsSource value except MANUAL + */ +export class NewspaperExtractorFactory { + static createExtractor(source: NewsSource): BaseNewspaperExtractor | null { + switch (source) { + case NewsSource.EL_PAIS: + return new ElPaisExtractor(); + case NewsSource.EL_MUNDO: + return new ElMundoExtractor(); + default: + Logger.warn(`No extractor available for source: ${source}`); + return null; + } + } + + static getAllAvailableExtractors(): BaseNewspaperExtractor[] { + const extractors: BaseNewspaperExtractor[] = []; + + for (const source of Object.values(NewsSource)) { + if (source !== NewsSource.MANUAL) { + const extractor = this.createExtractor(source); + if (extractor) { + extractors.push(extractor); + } + } + } + + return extractors; + } +} \ No newline at end of file