Extractors

This commit is contained in:
albert
2025-07-29 12:47:22 +02:00
parent 84960fe5fb
commit dcb0c3386b
4 changed files with 271 additions and 0 deletions

View File

@ -0,0 +1,78 @@
import { WebScraper } from '../utils/WebScraper';
import { IFeed, NewsSource } from '../types/Feed';
import { NewspaperConfig } from '../types/NewspaperTypes';
import { Logger } from '../utils/logger';
/**
 * Abstract base class for newspaper extractors.
 *
 * A concrete extractor supplies the list of front-page article URLs; this
 * class scrapes each of them and converts the result into feed items.
 */
export abstract class BaseNewspaperExtractor {
  /** Scraper used to download and parse each article URL. */
  protected webScraper: WebScraper;
  /** Configuration of the newspaper handled by this extractor. */
  protected config: NewspaperConfig;

  /**
   * @param config Newspaper configuration (name, source, enabled flag, ...).
   */
  constructor(config: NewspaperConfig) {
    this.webScraper = new WebScraper();
    this.config = config;
  }

  /**
   * Returns the article URLs found on the newspaper's front page.
   * Must be implemented by each concrete extractor.
   */
  abstract extractFrontPageUrls(): Promise<string[]>;

  /**
   * Scrapes every front-page article, one after another, and converts each
   * successful scrape into a feed item.
   *
   * A failure on a single article is logged and that article is skipped; a
   * failure while obtaining the URL list is logged and yields an empty array.
   *
   * @returns The feed items built from the articles that could be scraped.
   */
  async extractNews(): Promise<Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[]> {
    try {
      Logger.info(`Extracting front page URLs for ${this.config.name}`);
      const articleUrls = await this.extractFrontPageUrls();

      // Nothing to scrape: warn and bail out early.
      if (!articleUrls.length) {
        Logger.warn(`No URLs found for ${this.config.name}`);
        return [];
      }

      Logger.info(`Found ${articleUrls.length} articles for ${this.config.name}`);

      const collected: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];
      for (let index = 0; index < articleUrls.length; index += 1) {
        const articleUrl = articleUrls[index];
        try {
          const scraped = await this.webScraper.scrapeUrl(articleUrl);
          if (!scraped) {
            // The scraper produced no data for this URL; skip it.
            continue;
          }
          collected.push(this.webScraper.convertToFeedData(scraped, this.config.source));
        } catch (error) {
          // One broken article must not abort the whole run.
          Logger.error(`Error scraping article ${articleUrl}:`, error);
        }
      }
      return collected;
    } catch (error) {
      Logger.error(`Error extracting news for ${this.config.name}:`, error);
      return [];
    }
  }

  /**
   * Tells whether this extractor is enabled in its configuration.
   */
  isEnabled(): boolean {
    const { enabled } = this.config;
    return enabled;
  }

  /**
   * Returns the configured newspaper name.
   */
  getName(): string {
    const { name } = this.config;
    return name;
  }

  /**
   * Returns the configured news source of the newspaper.
   */
  getSource(): NewsSource {
    const { source } = this.config;
    return source;
  }
}

View File

@ -0,0 +1,78 @@
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';
/**
 * Extractor for the El Mundo newspaper.
 *
 * Downloads the front page and collects links to articles that belong to a
 * fixed set of news sections on the newspaper's own site.
 */
export class ElMundoExtractor extends BaseNewspaperExtractor {
  /** Path fragments identifying the sections whose articles are collected. */
  private static readonly SECTION_PATHS: readonly string[] = [
    '/espana/',
    '/internacional/',
    '/economia/',
    '/sociedad/',
    '/politica/'
  ];

  /** Maximum number of article URLs returned per front-page scan. */
  private static readonly MAX_URLS = 20;

  constructor() {
    // The selector strings are passed through in the config; this class does
    // not read them itself.
    super({
      name: 'El Mundo',
      source: NewsSource.EL_MUNDO,
      baseUrl: 'https://elmundo.es',
      frontPageUrl: 'https://elmundo.es',
      selectors: {
        articleLinks: '.ue-c-cover-content__link, .ue-c-cover-content__headline-link, h2 a, h3 a',
        titleSelector: 'h1, .ue-c-article__headline',
        descriptionSelector: '.ue-c-article__standfirst, .ue-c-cover-content__standfirst',
        dateSelector: '.ue-c-article__publishdate, time',
        imageSelector: '.ue-c-article__image img'
      },
      enabled: true
    });
  }

  /**
   * Fetches the front page and returns up to {@link ElMundoExtractor.MAX_URLS}
   * unique, absolute article URLs.
   *
   * Never rejects: HTTP errors, network errors and parsing errors are all
   * logged and reported as an empty array.
   *
   * @returns Absolute article URLs in the order they appear in the page.
   */
  async extractFrontPageUrls(): Promise<string[]> {
    try {
      const response = await fetch(this.config.frontPageUrl, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
      });

      if (!response.ok) {
        Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
        return [];
      }

      const html = await response.text();
      if (!html) {
        return [];
      }

      return this.collectArticleUrls(html);
    } catch (error) {
      // Covers network failures (fetch / body read) as well as parsing errors,
      // so every failure mode yields the same result as a non-OK response.
      Logger.error(`Error extracting El Mundo URLs:`, error);
      return [];
    }
  }

  /**
   * Scans raw HTML for anchor hrefs and keeps those that resolve to an
   * article in one of the tracked sections.
   */
  private collectArticleUrls(html: string): string[] {
    const base = new URL(this.config.baseUrl);
    // Only the opening tag is matched, so anchors whose content spans several
    // lines are found too.
    const hrefPattern = /<a\b[^>]*?\shref\s*=\s*["']([^"']+)["']/gi;
    const unique = new Set<string>();

    let match: RegExpExecArray | null;
    while ((match = hrefPattern.exec(html)) !== null) {
      const href = match[1];
      if (!href) {
        continue;
      }
      const articleUrl = this.resolveArticleUrl(href, base);
      if (articleUrl === null) {
        continue;
      }
      unique.add(articleUrl);
      if (unique.size >= ElMundoExtractor.MAX_URLS) {
        // Cap reached: no need to scan the rest of the document.
        break;
      }
    }

    return Array.from(unique);
  }

  /**
   * Resolves an href against the site base and returns the absolute URL when
   * it is an http(s) link on the newspaper's host (or a subdomain of it)
   * whose path lies in a tracked section; otherwise returns null.
   */
  private resolveArticleUrl(href: string, base: URL): string | null {
    let resolved: URL;
    try {
      // The URL constructor handles relative, root-relative and
      // protocol-relative ("//host/path") hrefs uniformly.
      resolved = new URL(href.trim(), base);
    } catch {
      return null;
    }

    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
      return null;
    }

    const onSite =
      resolved.hostname === base.hostname ||
      resolved.hostname.endsWith(`.${base.hostname}`);
    if (!onSite) {
      return null;
    }

    const inTrackedSection = ElMundoExtractor.SECTION_PATHS.some((section) =>
      resolved.pathname.includes(section)
    );
    return inTrackedSection ? resolved.href : null;
  }
}

View File

@ -0,0 +1,78 @@
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';
/**
 * Extractor for the El País newspaper.
 *
 * Downloads the front page and collects links to articles that belong to a
 * fixed set of news sections on the newspaper's own site.
 */
export class ElPaisExtractor extends BaseNewspaperExtractor {
  /** Path fragments identifying the sections whose articles are collected. */
  private static readonly SECTION_PATHS: readonly string[] = [
    '/politica/',
    '/economia/',
    '/sociedad/',
    '/internacional/',
    '/espana/'
  ];

  /** Maximum number of article URLs returned per front-page scan. */
  private static readonly MAX_URLS = 20;

  constructor() {
    // The selector strings are passed through in the config; this class does
    // not read them itself.
    super({
      name: 'El País',
      source: NewsSource.EL_PAIS,
      baseUrl: 'https://elpais.com',
      frontPageUrl: 'https://elpais.com',
      selectors: {
        articleLinks: 'article h2 a, .c_t a, .articulo-titulo a, h2.articulo-titulo a',
        titleSelector: 'h1, .articulo-titulo',
        descriptionSelector: '.articulo-entradilla, .entradilla, .subtitulo',
        dateSelector: '.articulo-fecha, time',
        imageSelector: '.articulo-foto img, .foto img'
      },
      enabled: true
    });
  }

  /**
   * Fetches the front page and returns up to {@link ElPaisExtractor.MAX_URLS}
   * unique, absolute article URLs.
   *
   * Never rejects: HTTP errors, network errors and parsing errors are all
   * logged and reported as an empty array.
   *
   * @returns Absolute article URLs in the order they appear in the page.
   */
  async extractFrontPageUrls(): Promise<string[]> {
    try {
      const response = await fetch(this.config.frontPageUrl, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
      });

      if (!response.ok) {
        Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
        return [];
      }

      const html = await response.text();
      if (!html) {
        return [];
      }

      return this.collectArticleUrls(html);
    } catch (error) {
      // Covers network failures (fetch / body read) as well as parsing errors,
      // so every failure mode yields the same result as a non-OK response.
      Logger.error(`Error extracting El País URLs:`, error);
      return [];
    }
  }

  /**
   * Scans raw HTML for anchor hrefs and keeps those that resolve to an
   * article in one of the tracked sections.
   */
  private collectArticleUrls(html: string): string[] {
    const base = new URL(this.config.baseUrl);
    // Only the opening tag is matched, so anchors whose content spans several
    // lines are found too.
    const hrefPattern = /<a\b[^>]*?\shref\s*=\s*["']([^"']+)["']/gi;
    const unique = new Set<string>();

    let match: RegExpExecArray | null;
    while ((match = hrefPattern.exec(html)) !== null) {
      const href = match[1];
      if (!href) {
        continue;
      }
      const articleUrl = this.resolveArticleUrl(href, base);
      if (articleUrl === null) {
        continue;
      }
      unique.add(articleUrl);
      if (unique.size >= ElPaisExtractor.MAX_URLS) {
        // Cap reached: no need to scan the rest of the document.
        break;
      }
    }

    return Array.from(unique);
  }

  /**
   * Resolves an href against the site base and returns the absolute URL when
   * it is an http(s) link on the newspaper's host (or a subdomain of it)
   * whose path lies in a tracked section; otherwise returns null.
   */
  private resolveArticleUrl(href: string, base: URL): string | null {
    let resolved: URL;
    try {
      // The URL constructor handles relative, root-relative and
      // protocol-relative ("//host/path") hrefs uniformly.
      resolved = new URL(href.trim(), base);
    } catch {
      return null;
    }

    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
      return null;
    }

    const onSite =
      resolved.hostname === base.hostname ||
      resolved.hostname.endsWith(`.${base.hostname}`);
    if (!onSite) {
      return null;
    }

    const inTrackedSection = ElPaisExtractor.SECTION_PATHS.some((section) =>
      resolved.pathname.includes(section)
    );
    return inTrackedSection ? resolved.href : null;
  }
}

View File

@ -0,0 +1,37 @@
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
import { ElPaisExtractor } from './ElPaisExtractor';
import { ElMundoExtractor } from './ElMundoExtractor';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';
/**
 * Factory that builds newspaper extractors from a news source identifier.
 */
export class NewspaperExtractorFactory {
  /**
   * Builds the extractor matching the given source.
   *
   * @param source News source to build an extractor for.
   * @returns A new extractor, or null (after logging a warning) when no
   *          extractor exists for that source.
   */
  static createExtractor(source: NewsSource): BaseNewspaperExtractor | null {
    if (source === NewsSource.EL_PAIS) {
      return new ElPaisExtractor();
    }
    if (source === NewsSource.EL_MUNDO) {
      return new ElMundoExtractor();
    }

    Logger.warn(`No extractor available for source: ${source}`);
    return null;
  }

  /**
   * Builds one extractor for every value of NewsSource except MANUAL,
   * leaving out the sources for which no extractor can be created.
   *
   * @returns The extractors that could be created, in enum value order.
   */
  static getAllAvailableExtractors(): BaseNewspaperExtractor[] {
    return Object.values(NewsSource)
      .filter((candidate) => candidate !== NewsSource.MANUAL)
      .map((candidate) => this.createExtractor(candidate))
      .filter((built): built is BaseNewspaperExtractor => Boolean(built));
  }
}