Extractors

2025-07-29 12:47:22 +02:00
parent 84960fe5fb
commit dcb0c3386b
4 changed files with 271 additions and 0 deletions
--- a/src/extractors/BaseNewspaperExtractor.ts
+++ b/src/extractors/BaseNewspaperExtractor.ts
@@ -0,0 +1,78 @@
+import { WebScraper } from '../utils/WebScraper';
+import { IFeed, NewsSource } from '../types/Feed';
+import { NewspaperConfig } from '../types/NewspaperTypes';
+import { Logger } from '../utils/logger';
+
+/**
+ * Abstract base class for newspaper extractors.
+ *
+ * A concrete extractor only has to discover the front-page article URLs;
+ * this class scrapes each of them through the shared `WebScraper` and
+ * converts the results into feed data tagged with the configured source.
+ */
+export abstract class BaseNewspaperExtractor {
+  protected webScraper: WebScraper;
+  protected config: NewspaperConfig;
+
+  /**
+   * @param config Newspaper settings; this class reads `name`, `source`
+   *   and `enabled` from it.
+   */
+  constructor(config: NewspaperConfig) {
+    this.webScraper = new WebScraper();
+    this.config = config;
+  }
+
+  /**
+   * Returns the article URLs found on the newspaper's front page.
+   * Every concrete extractor must provide its own implementation.
+   */
+  abstract extractFrontPageUrls(): Promise<string[]>;
+
+  /**
+   * Scrapes every front-page URL, one after another, and collects the
+   * resulting feed items.
+   *
+   * A failure on a single article is logged and skipped. A failure while
+   * obtaining the URL list is logged and produces an empty result.
+   *
+   * @returns The feed items that could be scraped (possibly none).
+   */
+  async extractNews(): Promise<Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[]> {
+    type FeedDraft = Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>;
+
+    try {
+      Logger.info(`Extracting front page URLs for ${this.config.name}`);
+      const frontPageUrls = await this.extractFrontPageUrls();
+
+      // Nothing to scrape: report it and stop early.
+      if (frontPageUrls.length === 0) {
+        Logger.warn(`No URLs found for ${this.config.name}`);
+        return [];
+      }
+
+      Logger.info(`Found ${frontPageUrls.length} articles for ${this.config.name}`);
+
+      const collected: FeedDraft[] = [];
+      for (const articleUrl of frontPageUrls) {
+        try {
+          const scraped = await this.webScraper.scrapeUrl(articleUrl);
+          if (!scraped) {
+            continue;
+          }
+          collected.push(this.webScraper.convertToFeedData(scraped, this.config.source));
+        } catch (scrapeError) {
+          // One broken article must not abort the rest of the run.
+          Logger.error(`Error scraping article ${articleUrl}:`, scrapeError);
+        }
+      }
+
+      return collected;
+    } catch (extractionError) {
+      Logger.error(`Error extracting news for ${this.config.name}:`, extractionError);
+      return [];
+    }
+  }
+
+  /**
+   * Tells whether this extractor is enabled in its configuration.
+   */
+  isEnabled(): boolean {
+    const { enabled } = this.config;
+    return enabled;
+  }
+
+  /**
+   * Returns the configured newspaper name.
+   */
+  getName(): string {
+    const { name } = this.config;
+    return name;
+  }
+
+  /**
+   * Returns the configured news source of the newspaper.
+   */
+  getSource(): NewsSource {
+    const { source } = this.config;
+    return source;
+  }
+}
--- a/src/extractors/ElMundoExtractor.ts
+++ b/src/extractors/ElMundoExtractor.ts
@@ -0,0 +1,78 @@
+import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
+import { NewsSource } from '../types/Feed';
+import { Logger } from '../utils/logger';
+
+/**
+ * Extractor for El Mundo.
+ *
+ * Candidate article links are discovered by scanning the `href` attribute of
+ * every `<a>` tag on the front page and keeping the ones that point into one
+ * of the sections listed in `SECTION_PATHS`.
+ */
+export class ElMundoExtractor extends BaseNewspaperExtractor {
+  /** Path fragments that mark a link as belonging to a relevant news section. */
+  private static readonly SECTION_PATHS: readonly string[] = [
+    '/espana/',
+    '/internacional/',
+    '/economia/',
+    '/sociedad/',
+    '/politica/'
+  ];
+
+  /** Upper bound on the number of URLs returned by one front-page scan. */
+  private static readonly MAX_URLS = 20;
+
+  constructor() {
+    super({
+      name: 'El Mundo',
+      source: NewsSource.EL_MUNDO,
+      baseUrl: 'https://elmundo.es',
+      frontPageUrl: 'https://elmundo.es',
+      selectors: {
+        articleLinks: '.ue-c-cover-content__link, .ue-c-cover-content__headline-link, h2 a, h3 a',
+        titleSelector: 'h1, .ue-c-article__headline',
+        descriptionSelector: '.ue-c-article__standfirst, .ue-c-cover-content__standfirst',
+        dateSelector: '.ue-c-article__publishdate, time',
+        imageSelector: '.ue-c-article__image img'
+      },
+      enabled: true
+    });
+  }
+
+  /**
+   * Downloads the front page and returns the article URLs found in it.
+   *
+   * A non-OK HTTP status is logged and yields an empty list, as does an empty
+   * body. A rejection of `fetch` itself is not caught here and propagates to
+   * the caller.
+   *
+   * @returns Up to `MAX_URLS` distinct absolute URLs, in document order.
+   */
+  async extractFrontPageUrls(): Promise<string[]> {
+    // Fetch the raw HTML directly.
+    const response = await fetch(this.config.frontPageUrl, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+      }
+    });
+
+    if (!response.ok) {
+      Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
+      return [];
+    }
+
+    const html = await response.text();
+    if (!html) {
+      return [];
+    }
+
+    try {
+      // Only the opening tag is matched. Requiring the closing </a> on the same
+      // line would drop every anchor whose content spans several lines.
+      // The whitespace required before `href` keeps `data-href` and tags such
+      // as <article> from matching.
+      const anchorHref = /<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
+      // A Set keeps insertion order and de-duplicates in constant time.
+      const found = new Set<string>();
+      let match: RegExpExecArray | null;
+
+      // Stop scanning as soon as the cap is reached.
+      while (found.size < ElMundoExtractor.MAX_URLS && (match = anchorHref.exec(html)) !== null) {
+        const url = this.resolveArticleUrl(match[1] ?? match[2] ?? '');
+        if (url !== null) {
+          found.add(url);
+        }
+      }
+
+      return Array.from(found);
+    } catch (error) {
+      Logger.error(`Error extracting El Mundo URLs:`, error);
+      return [];
+    }
+  }
+
+  /**
+   * Turns a raw `href` value into an absolute article URL.
+   *
+   * Relative and protocol-relative links are resolved against `baseUrl` with
+   * the URL parser, so `//host/path` is not mangled by string concatenation.
+   *
+   * @param href Raw attribute value taken from the HTML.
+   * @returns The absolute URL, or `null` when the value cannot be parsed, is
+   *   not http(s), or does not contain any of the `SECTION_PATHS`.
+   */
+  private resolveArticleUrl(href: string): string | null {
+    let resolved: URL;
+    try {
+      // Attribute values are HTML-escaped, so `&amp;` stands for a literal `&`.
+      resolved = new URL(href.trim().replace(/&amp;/g, '&'), this.config.baseUrl);
+    } catch {
+      return null;
+    }
+
+    // Skip mailto:, javascript: and similar non-web links.
+    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
+      return null;
+    }
+
+    // NOTE(review): links to other hosts are still accepted, as before — confirm
+    // whether they should be restricted to the newspaper's own domain.
+    const absolute = resolved.href;
+    const inSection = ElMundoExtractor.SECTION_PATHS.some((section) => absolute.includes(section));
+    return inSection ? absolute : null;
+  }
+}
--- a/src/extractors/ElPaisExtractor.ts
+++ b/src/extractors/ElPaisExtractor.ts
@@ -0,0 +1,78 @@
+import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
+import { NewsSource } from '../types/Feed';
+import { Logger } from '../utils/logger';
+
+/**
+ * Extractor for El País.
+ *
+ * Candidate article links are discovered by scanning the `href` attribute of
+ * every `<a>` tag on the front page and keeping the ones that point into one
+ * of the sections listed in `SECTION_PATHS`.
+ */
+export class ElPaisExtractor extends BaseNewspaperExtractor {
+  /** Path fragments that mark a link as belonging to a relevant news section. */
+  private static readonly SECTION_PATHS: readonly string[] = [
+    '/politica/',
+    '/economia/',
+    '/sociedad/',
+    '/internacional/',
+    '/espana/'
+  ];
+
+  /** Upper bound on the number of URLs returned by one front-page scan. */
+  private static readonly MAX_URLS = 20;
+
+  constructor() {
+    super({
+      name: 'El País',
+      source: NewsSource.EL_PAIS,
+      baseUrl: 'https://elpais.com',
+      frontPageUrl: 'https://elpais.com',
+      selectors: {
+        articleLinks: 'article h2 a, .c_t a, .articulo-titulo a, h2.articulo-titulo a',
+        titleSelector: 'h1, .articulo-titulo',
+        descriptionSelector: '.articulo-entradilla, .entradilla, .subtitulo',
+        dateSelector: '.articulo-fecha, time',
+        imageSelector: '.articulo-foto img, .foto img'
+      },
+      enabled: true
+    });
+  }
+
+  /**
+   * Downloads the front page and returns the article URLs found in it.
+   *
+   * A non-OK HTTP status is logged and yields an empty list, as does an empty
+   * body. A rejection of `fetch` itself is not caught here and propagates to
+   * the caller.
+   *
+   * @returns Up to `MAX_URLS` distinct absolute URLs, in document order.
+   */
+  async extractFrontPageUrls(): Promise<string[]> {
+    // Fetch the raw HTML directly.
+    const response = await fetch(this.config.frontPageUrl, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+      }
+    });
+
+    if (!response.ok) {
+      Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
+      return [];
+    }
+
+    const html = await response.text();
+    if (!html) {
+      return [];
+    }
+
+    try {
+      // Only the opening tag is matched. Requiring the closing </a> on the same
+      // line would drop every anchor whose content spans several lines.
+      // The whitespace required before `href` keeps `data-href` and tags such
+      // as <article> from matching.
+      const anchorHref = /<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|'([^']*)')/gi;
+      // A Set keeps insertion order and de-duplicates in constant time.
+      const found = new Set<string>();
+      let match: RegExpExecArray | null;
+
+      // Stop scanning as soon as the cap is reached.
+      while (found.size < ElPaisExtractor.MAX_URLS && (match = anchorHref.exec(html)) !== null) {
+        const url = this.resolveArticleUrl(match[1] ?? match[2] ?? '');
+        if (url !== null) {
+          found.add(url);
+        }
+      }
+
+      return Array.from(found);
+    } catch (error) {
+      Logger.error(`Error extracting El País URLs:`, error);
+      return [];
+    }
+  }
+
+  /**
+   * Turns a raw `href` value into an absolute article URL.
+   *
+   * Relative and protocol-relative links are resolved against `baseUrl` with
+   * the URL parser, so `//host/path` is not mangled by string concatenation.
+   *
+   * @param href Raw attribute value taken from the HTML.
+   * @returns The absolute URL, or `null` when the value cannot be parsed, is
+   *   not http(s), or does not contain any of the `SECTION_PATHS`.
+   */
+  private resolveArticleUrl(href: string): string | null {
+    let resolved: URL;
+    try {
+      // Attribute values are HTML-escaped, so `&amp;` stands for a literal `&`.
+      resolved = new URL(href.trim().replace(/&amp;/g, '&'), this.config.baseUrl);
+    } catch {
+      return null;
+    }
+
+    // Skip mailto:, javascript: and similar non-web links.
+    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
+      return null;
+    }
+
+    // NOTE(review): links to other hosts are still accepted, as before — confirm
+    // whether they should be restricted to the newspaper's own domain.
+    const absolute = resolved.href;
+    const inSection = ElPaisExtractor.SECTION_PATHS.some((section) => absolute.includes(section));
+    return inSection ? absolute : null;
+  }
+}
--- a/src/extractors/NewspaperExtractorFactory.ts
+++ b/src/extractors/NewspaperExtractorFactory.ts
@@ -0,0 +1,37 @@
+import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
+import { ElPaisExtractor } from './ElPaisExtractor';
+import { ElMundoExtractor } from './ElMundoExtractor';
+import { NewsSource } from '../types/Feed';
+import { Logger } from '../utils/logger';
+
+/**
+ * Factory that builds newspaper extractors from a `NewsSource` value.
+ */
+export class NewspaperExtractorFactory {
+  /**
+   * Builds the extractor that handles the given source.
+   *
+   * @param source News source to build an extractor for.
+   * @returns A new extractor, or `null` (after logging a warning) when no
+   *   extractor exists for that source.
+   */
+  static createExtractor(source: NewsSource): BaseNewspaperExtractor | null {
+    if (source === NewsSource.EL_PAIS) {
+      return new ElPaisExtractor();
+    }
+    if (source === NewsSource.EL_MUNDO) {
+      return new ElMundoExtractor();
+    }
+
+    Logger.warn(`No extractor available for source: ${source}`);
+    return null;
+  }
+
+  /**
+   * Builds one extractor for every `NewsSource` value except `MANUAL`,
+   * leaving out the sources for which `createExtractor` returns nothing.
+   *
+   * @returns The extractors that could be created, in enum iteration order.
+   */
+  static getAllAvailableExtractors(): BaseNewspaperExtractor[] {
+    return Object.values(NewsSource)
+      .filter((candidate) => candidate !== NewsSource.MANUAL)
+      .map((candidate) => this.createExtractor(candidate))
+      .filter((built): built is BaseNewspaperExtractor => Boolean(built));
+  }
+}