Compare commits

10 Commits

074f66ac2b ... febcc60605

| Author | SHA1 | Date |
| --- | --- | --- |
|  | febcc60605 |  |
|  | 4e36c2217a |  |
|  | dcb0c3386b |  |
|  | 84960fe5fb |  |
|  | ced2254068 |  |
|  | d8381c893d |  |
|  | 891b1e478d |  |
|  | d35416b5c8 |  |
|  | 36f6de4edd |  |
|  | e0cb439234 |  |

3  .gitignore  (vendored)

@@ -1,3 +1,4 @@
 node_modules
 dist
 *.bk
+.DS_Store

41  README.md
@@ -117,3 +117,44 @@ EXPOSE 3000
 CMD ["node", "dist/index.js"]
 
 ```
+
+
+### Scraper OOP
+
+#### Entrypoint
+- `scraper.ts`                     - Application entry point that initializes the scraping system
+
+#### Core Services
+- `ScrapingScheduler.ts`           - Orchestrates scraping cycles and timing
+- `ContentScrapingService.ts`      - Handles web content scraping logic
+- `FeedReaderService.ts`           - Manages newspaper extraction
+- `ScrapingService.ts`             - Base scraping functionality
+
+#### Utilities
+- `WebScraper.ts`                  - HTML parsing and data extraction utility
+- `logger.ts`                      - Logging utility
+
+#### Extractors
+- `BaseNewspaperExtractor.ts`      - Abstract base class
+- `ElPaisExtractor.ts`             - Extractor specific to El País
+- `ElMundoExtractor.ts`            - Extractor specific to El Mundo
+- `NewspaperExtractorFactory.ts`   - Factory class for creating the extractors
+
+#### Types & Interfaces
+- `Feed.ts`                        - Types and interfaces
+- `NewspaperTypes.ts`              - Interface configuration
+- `FeedRepository.ts`              - Database abstraction interface
+
+## OOP principles
+
+- I have tried to follow OOP principles. For example:
+  - Separation of concerns, through the abstraction layers and dedicated services.
+  - A factory for the extractors in NewspaperExtractorFactory: a design pattern that creates objects of a specific class based on given parameters, so the system can be adapted to each supported newspaper.
+  - Inheritance, from BaseNewspaperExtractor down to the concrete extractors.
+  - Utils, to keep the code DRY and reusable from different classes.
+  - I have tried to add tests where they are needed and where they make sense.
+
+
+Of course, any proposal is always open to debate and improvement.
+In my case, and within the given constraints, I have tried to follow the instructions and see how they can be adapted.
+With more time it could surely be simplified further without losing functionality.
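
The factory and inheritance described in the README hunk above map onto the extractor hierarchy that the new tests exercise. As a reference, here is a minimal sketch of that hierarchy; the method names (`getName`, `getSource`, `extractNews`, `isEnabled`, `createExtractor`) follow the mocks in the test files later in this diff, while the types, signatures, and bodies are assumptions rather than the committed implementation.

```typescript
// Minimal sketch of the extractor hierarchy, not the committed code.
// Method names follow the mocks used in the tests below; everything else is assumed.
import { NewsSource } from './types/Feed';

// Hypothetical shape of a scraped article.
export interface ExtractedArticle {
  title: string;
  description: string;
  url: string;
  publishedAt: Date;
}

export abstract class BaseNewspaperExtractor {
  // Each newspaper-specific subclass describes itself and how to extract its news.
  abstract getName(): string;
  abstract getSource(): NewsSource;
  abstract extractNews(): Promise<ExtractedArticle[]>;

  // Shared behaviour lives in the base class (inheritance).
  isEnabled(): boolean {
    return true;
  }
}

export class ElPaisExtractor extends BaseNewspaperExtractor {
  getName(): string { return 'El País'; }
  getSource(): NewsSource { return NewsSource.EL_PAIS; }
  async extractNews(): Promise<ExtractedArticle[]> {
    // Front-page parsing for El País would go here; omitted in this sketch.
    return [];
  }
}

// The factory decides which concrete extractor to build for a given source,
// so callers never instantiate newspaper-specific classes directly.
export class NewspaperExtractorFactory {
  static createExtractor(source: NewsSource): BaseNewspaperExtractor | null {
    switch (source) {
      case NewsSource.EL_PAIS:
        return new ElPaisExtractor();
      default:
        return null;
    }
  }
}
```
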
@@ -18,6 +18,8 @@
     "build": "tsc",
     "start": "node dist/server.js",
     "dev": "tsx watch src/server.ts",
+    "scraper": "node dist/scraper.js",
+    "scraper:dev": "tsx watch src/scraper.ts",
     "test": "jest",
     "test:watch": "jest --watch",
     "lint": "eslint src/**/*.ts",
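
The two new scripts run a compiled `dist/scraper.js`, and the README above describes `scraper.ts` as the entry point that initializes the scraping system. The committed entrypoint is not shown in this compare view; the following is only a hedged sketch of how such an entrypoint could wire things together, using the `ScrapingScheduler` options and methods that the tests below rely on. `MongoFeedRepository` is an assumed concrete implementation of `IFeedRepository`, not a name taken from this diff.

```typescript
// Hypothetical src/scraper.ts: illustrative only, not the committed entrypoint.
import { ScrapingScheduler } from './services/ScrapingScheduler';
// Assumed concrete IFeedRepository implementation; the real export name may differ.
import { MongoFeedRepository } from './repositories/FeedRepository';

async function main(): Promise<void> {
  const repository = new MongoFeedRepository();

  // Same option names the ScrapingScheduler tests use.
  const scheduler = new ScrapingScheduler(repository, {
    intervalMinutes: 30,
    maxRetries: 3,
    retryDelayMinutes: 5,
    enabled: true
  });

  scheduler.start();

  // Shut down cleanly when the process is interrupted (Ctrl+C, container stop).
  process.on('SIGINT', async () => {
    await scheduler.shutdown();
    process.exit(0);
  });
}

main().catch(err => {
  console.error('Scraper failed to start:', err);
  process.exit(1);
});
```

With the new scripts, such an entrypoint would be started with `npm run scraper` in production and `npm run scraper:dev` during development.
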
							
								
								
									
259  src/__tests__/ContentScrapingService.test.ts  (Normal file)
@@ -0,0 +1,259 @@
+import { ContentScrapingService } from '../services/ContentScrapingService';
+import { WebScraper } from '../utils/WebScraper';
+import { ScrapingService } from '../services/ScrapingService';
+import { IFeedRepository } from '../repositories/FeedRepository';
+import { NewsSource } from '../types/Feed';
+import { Logger } from '../utils/logger';
+
+// Mock dependencies
+jest.mock('../utils/WebScraper');
+jest.mock('../services/ScrapingService');
+jest.mock('../utils/logger');
+
+describe('ContentScrapingService', () => {
+  let contentScrapingService: ContentScrapingService;
+  let mockFeedRepository: jest.Mocked<IFeedRepository>;
+  let mockWebScraper: jest.Mocked<WebScraper>;
+
+  let mockScrapingService: jest.Mocked<ScrapingService>;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+
+    mockFeedRepository = {
+      create: jest.fn(),
+      findAll: jest.fn(),
+      findById: jest.fn(),
+      findByUrl: jest.fn(),
+      findBySource: jest.fn(),
+      findTodaysFrontPage: jest.fn(),
+      update: jest.fn(),
+      delete: jest.fn(),
+      deleteMany: jest.fn(),
+      count: jest.fn(),
+      exists: jest.fn()
+    };
+
+    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
+
+    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;
+
+    // Mock constructor calls
+    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
+
+    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);
+
+    contentScrapingService = new ContentScrapingService(mockFeedRepository);
+  });
+
+
+
+  describe('scrapeFromWebUrls', () => {
+    test('should successfully scrape from web URLs', async () => {
+      const mockScrapedData = [
+        {
+          title: 'Web Article 1',
+          description: 'Web Description 1',
+          url: 'https://example.com/web1',
+          publishedAt: new Date()
+        },
+        {
+          title: 'Web Article 2',
+          description: 'Web Description 2',
+          url: 'https://example.com/web2',
+          publishedAt: new Date()
+        }
+      ];
+
+      const mockFeedData = mockScrapedData.map(data => ({
+        ...data,
+        source: NewsSource.EL_MUNDO,
+        isManual: false
+      }));
+
+      const mockResults = [
+        { _id: '1', ...mockFeedData[0] },
+        { _id: '2', ...mockFeedData[1] }
+      ];
+
+      mockWebScraper.scrapeUrl
+        .mockResolvedValueOnce(mockScrapedData[0])
+        .mockResolvedValueOnce(mockScrapedData[1]);
+
+      mockWebScraper.convertToFeedData
+        .mockReturnValueOnce(mockFeedData[0])
+        .mockReturnValueOnce(mockFeedData[1]);
+
+      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);
+
+      const urls = ['https://example.com/web1', 'https://example.com/web2'];
+      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);
+
+      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
+      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
+      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
+      expect(result).toEqual({
+        success: 2,
+        failed: 0,
+        duplicates: 0,
+        items: mockResults
+      });
+    });
+
+    test('should handle failed web scraping', async () => {
+      mockWebScraper.scrapeUrl
+        .mockResolvedValueOnce(null)
+        .mockRejectedValueOnce(new Error('Scraping failed'));
+
+      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
+      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);
+
+      expect(result).toEqual({
+        success: 0,
+        failed: 2,
+        duplicates: 0,
+        items: []
+      });
+      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
+    });
+  });
+
+  describe('scrapeFromSource', () => {
+    test('should scrape from web URLs', async () => {
+      const config = {
+        name: 'Test Source',
+        source: NewsSource.EL_PAIS,
+        webUrls: ['https://example.com/web1'],
+        enabled: true
+      };
+
+      const mockScrapedData = {
+        title: 'Web Article',
+        description: 'Web Description',
+        url: 'https://example.com/web1',
+        publishedAt: new Date()
+      };
+
+      const mockWebFeedData = {
+        ...mockScrapedData,
+        source: NewsSource.EL_PAIS,
+        isManual: false
+      };
+
+      // Mock web scraping
+      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
+      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
+      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);
+
+      const result = await contentScrapingService.scrapeFromSource(config);
+
+      expect(result).toEqual({
+        success: 1,
+        failed: 0,
+        duplicates: 0,
+        items: [{ _id: '1', ...mockWebFeedData }]
+      });
+    });
+
+    test('should skip disabled sources', async () => {
+      const config = {
+        name: 'Disabled Source',
+        source: NewsSource.EL_PAIS,
+        webUrls: ['https://example.com/web1'],
+        enabled: false
+      };
+
+      const result = await contentScrapingService.scrapeFromSource(config);
+
+      expect(result).toEqual({
+        success: 0,
+        failed: 0,
+        duplicates: 0,
+        items: []
+      });
+      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
+    });
+  });
+
+  describe('scrapeFromMultipleSources', () => {
+    test('should scrape from multiple sources', async () => {
+      const configs = [
+        {
+          name: 'Source 1',
+          source: NewsSource.EL_PAIS,
+          webUrls: ['https://example.com/web1'],
+          enabled: true
+        },
+        {
+          name: 'Source 2',
+          source: NewsSource.EL_MUNDO,
+          webUrls: ['https://example.com/web2'],
+          enabled: true
+        }
+      ];
+
+      const mockScrapedData1 = {
+        title: 'Article 1',
+        description: 'Description 1',
+        url: 'https://example.com/web1',
+        publishedAt: new Date()
+      };
+
+      const mockScrapedData2 = {
+        title: 'Article 2',
+        description: 'Description 2',
+        url: 'https://example.com/web2',
+        publishedAt: new Date()
+      };
+
+      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
+      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };
+
+      mockWebScraper.scrapeUrl
+        .mockResolvedValueOnce(mockScrapedData1)
+        .mockResolvedValueOnce(mockScrapedData2);
+
+      mockWebScraper.convertToFeedData
+        .mockReturnValueOnce(mockFeedData1)
+        .mockReturnValueOnce(mockFeedData2);
+
+      mockScrapingService.processFeedBatch
+        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
+        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);
+
+      const results = await contentScrapingService.scrapeFromMultipleSources(configs);
+
+      expect(results.size).toBe(2);
+      expect(results.get('Source 1')).toEqual({
+        success: 1,
+        failed: 0,
+        duplicates: 0,
+        items: [{ _id: '1', ...mockFeedData1 }]
+      });
+      expect(results.get('Source 2')).toEqual({
+        success: 1,
+        failed: 0,
+        duplicates: 0,
+        items: [{ _id: '2', ...mockFeedData2 }]
+      });
+    });
+  });
+
+  describe('createNewsSourceConfigs', () => {
+    test('should create default news source configurations', () => {
+      const configs = ContentScrapingService.createNewsSourceConfigs();
+
+      expect(configs).toHaveLength(2);
+      expect(configs[0]).toEqual({
+        name: 'El País',
+        source: NewsSource.EL_PAIS,
+        enabled: true
+      });
+      expect(configs[1]).toEqual({
+        name: 'El Mundo',
+        source: NewsSource.EL_MUNDO,
+        enabled: true
+      });
+    });
+  });
+});
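
The mocked repository above spells out the full `IFeedRepository` surface that the services depend on. As a reference, a sketch of that interface follows; the method names come from the mocks in these tests, while the `Feed`/`FeedData` shapes and the exact signatures are assumptions, not the committed definitions.

```typescript
// IFeedRepository reconstructed from the mocks in these tests; signatures are assumed.
import { NewsSource } from './types/Feed';

// Hypothetical shapes for the stored entity and its creation payload,
// following the fields used by the mock data in the tests.
export interface FeedData {
  title: string;
  description: string;
  url: string;
  source: NewsSource;
  publishedAt: Date;
  isManual: boolean;
}

export interface Feed extends FeedData {
  _id: string;
}

export interface IFeedRepository {
  create(data: FeedData): Promise<Feed>;
  findAll(): Promise<Feed[]>;
  findById(id: string): Promise<Feed | null>;
  findByUrl(url: string): Promise<Feed | null>;
  findBySource(source: NewsSource): Promise<Feed[]>;
  findTodaysFrontPage(): Promise<Feed[]>;
  update(id: string, data: Partial<FeedData>): Promise<Feed | null>;
  delete(id: string): Promise<boolean>;
  deleteMany(ids: string[]): Promise<number>;
  count(): Promise<number>;
  exists(url: string): Promise<boolean>;
}
```
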
							
								
								
									
108  src/__tests__/FeedReaderService.test.ts  (Normal file)
@@ -0,0 +1,108 @@
+import { FeedReaderService } from '../services/FeedReaderService';
+import { IFeedRepository } from '../repositories/FeedRepository';
+import { NewsSource } from '../types/Feed';
+
+// Mock dependencies
+jest.mock('../utils/logger');
+jest.mock('../services/ScrapingService');
+jest.mock('../utils/WebScraper');
+jest.mock('../extractors/ElPaisExtractor');
+jest.mock('../extractors/ElMundoExtractor');
+
+// Mock fetch globally
+global.fetch = jest.fn();
+
+const mockFeedRepository: jest.Mocked<IFeedRepository> = {
+  create: jest.fn(),
+  findAll: jest.fn(),
+  findById: jest.fn(),
+  findByUrl: jest.fn(),
+  update: jest.fn(),
+  delete: jest.fn(),
+  findBySource: jest.fn(),
+  findTodaysFrontPage: jest.fn(),
+  deleteMany: jest.fn(),
+  count: jest.fn(),
+  exists: jest.fn()
+};
+
+// Mock ScrapingService
+const mockScrapingService = {
+  processFeedBatch: jest.fn()
+};
+
+jest.mock('../services/ScrapingService', () => {
+  return {
+    ScrapingService: jest.fn().mockImplementation(() => mockScrapingService)
+  };
+});
+
+// Mock WebScraper
+const mockWebScraper = {
+  scrapeUrl: jest.fn(),
+  convertToFeedData: jest.fn()
+};
+
+jest.mock('../utils/WebScraper', () => {
+  return {
+    WebScraper: jest.fn().mockImplementation(() => mockWebScraper)
+  };
+});
+
+// Mock extractors
+const mockExtractor = {
+  extractNews: jest.fn(),
+  isEnabled: jest.fn().mockReturnValue(true),
+  getName: jest.fn(),
+  getSource: jest.fn()
+};
+
+const mockElPaisExtractor = {
+  ...mockExtractor,
+  getName: jest.fn().mockReturnValue('El País'),
+  getSource: jest.fn().mockReturnValue(NewsSource.EL_PAIS)
+};
+
+const mockElMundoExtractor = {
+  ...mockExtractor,
+  getName: jest.fn().mockReturnValue('El Mundo'),
+  getSource: jest.fn().mockReturnValue(NewsSource.EL_MUNDO)
+};
+
+jest.mock('../extractors/NewspaperExtractorFactory', () => ({
+  NewspaperExtractorFactory: {
+    getAllAvailableExtractors: jest.fn(() => [mockElPaisExtractor, mockElMundoExtractor]),
+    createExtractor: jest.fn((source) => {
+      if (source === NewsSource.EL_PAIS) return mockElPaisExtractor;
+      if (source === NewsSource.EL_MUNDO) return mockElMundoExtractor;
+      return null;
+    })
+  }
+}));
+
+describe('FeedReaderService', () => {
+  let feedReaderService: FeedReaderService;
+  const mockFetch = fetch as jest.MockedFunction<typeof fetch>;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+    feedReaderService = new FeedReaderService(mockFeedRepository);
+  });
+
+  describe('Constructor and Initialization', () => {
+    it('should initialize with available extractors', () => {
+      const newspapers = feedReaderService.getAvailableNewspapers();
+      expect(newspapers).toHaveLength(2);
+      expect(newspapers.map(n => n.source)).toContain(NewsSource.EL_PAIS);
+      expect(newspapers.map(n => n.source)).toContain(NewsSource.EL_MUNDO);
+    });
+
+    it('should have all extractors enabled by default', () => {
+      const newspapers = feedReaderService.getAvailableNewspapers();
+      newspapers.forEach(newspaper => {
+        expect(newspaper.enabled).toBe(true);
+      });
+    });
+  });
+
+});
							
								
								
									
317  src/__tests__/ScrapingScheduler.test.ts  (Normal file)
@@ -0,0 +1,317 @@
+import { ScrapingScheduler } from '../services/ScrapingScheduler';
+import { ContentScrapingService } from '../services/ContentScrapingService';
+import { IFeedRepository } from '../repositories/FeedRepository';
+import { NewsSource } from '../types/Feed';
+
+// Mock dependencies
+jest.mock('../services/ContentScrapingService');
+jest.useFakeTimers();
+
+describe('ScrapingScheduler', () => {
+  let scrapingScheduler: ScrapingScheduler;
+  let mockFeedRepository: jest.Mocked<IFeedRepository>;
+  let mockContentScrapingService: jest.Mocked<ContentScrapingService>;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+    jest.clearAllTimers();
+
+    mockFeedRepository = {
+      create: jest.fn(),
+      findAll: jest.fn(),
+      findById: jest.fn(),
+      findByUrl: jest.fn(),
+      findBySource: jest.fn(),
+      findTodaysFrontPage: jest.fn(),
+      update: jest.fn(),
+      delete: jest.fn(),
+      deleteMany: jest.fn(),
+      count: jest.fn(),
+      exists: jest.fn()
+    };
+
+    mockContentScrapingService = {
+      scrapeFromMultipleSources: jest.fn(),
+
+      scrapeFromWebUrls: jest.fn(),
+      scrapeFromSource: jest.fn()
+    } as unknown as jest.Mocked<ContentScrapingService>;
+
+    // Mock ContentScrapingService constructor
+    (ContentScrapingService as jest.MockedClass<typeof ContentScrapingService>)
+      .mockImplementation(() => mockContentScrapingService);
+
+    // Mock static method
+    (ContentScrapingService.createNewsSourceConfigs as jest.Mock) = jest.fn().mockReturnValue([
+      {
+        name: 'El País',
+        source: NewsSource.EL_PAIS,
+        webUrls: ['https://elpais.com'],
+        enabled: true
+      },
+      {
+        name: 'El Mundo',
+        source: NewsSource.EL_MUNDO,
+        webUrls: ['https://elmundo.es'],
+        enabled: true
+      }
+    ]);
+
+    scrapingScheduler = new ScrapingScheduler(mockFeedRepository, {
+      intervalMinutes: 1, // 1 minute for testing
+      maxRetries: 2,
+      retryDelayMinutes: 1,
+      enabled: true
+    });
+  });
+
+  afterEach(() => {
+    scrapingScheduler.stop();
+  });
+
+  describe('Basic Functionality', () => {
+    test('should create ScrapingScheduler instance with default config', () => {
+      const defaultScheduler = new ScrapingScheduler(mockFeedRepository);
+      const config = defaultScheduler.getConfig();
+
+      expect(config).toEqual({
+        intervalMinutes: 30,
+        maxRetries: 3,
+        retryDelayMinutes: 5,
+        enabled: true
+      });
+    });
+
+    test('should create ScrapingScheduler instance with custom config', () => {
+      const customConfig = {
+        intervalMinutes: 15,
+        maxRetries: 5,
+        retryDelayMinutes: 2,
+        enabled: false
+      };
+
+      const customScheduler = new ScrapingScheduler(mockFeedRepository, customConfig);
+      const config = customScheduler.getConfig();
+
+      expect(config).toEqual(customConfig);
+    });
+
+    test('should initialize with empty stats', () => {
+      const stats = scrapingScheduler.getStats();
+
+      expect(stats).toEqual({
+        lastRun: null,
+        nextRun: null,
+        totalRuns: 0,
+        successfulRuns: 0,
+        failedRuns: 0,
+        totalItemsScraped: 0,
+        totalDuplicates: 0
+      });
+    });
+  });
+
+  describe('Scheduler Control', () => {
+    test('should start and stop scheduler', () => {
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
+
+      scrapingScheduler.start();
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
+
+      scrapingScheduler.stop();
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
+    });
+
+    test('should not start if already running', () => {
+      scrapingScheduler.start();
+      const firstStart = scrapingScheduler.isSchedulerRunning();
+
+      scrapingScheduler.start(); // Try to start again
+      const secondStart = scrapingScheduler.isSchedulerRunning();
+
+      expect(firstStart).toBe(true);
+      expect(secondStart).toBe(true);
+      expect(jest.getTimerCount()).toBe(1); // Only one timer should be active
+    });
+
+    test('should not start if disabled', () => {
+      const disabledScheduler = new ScrapingScheduler(mockFeedRepository, { enabled: false });
+
+      disabledScheduler.start();
+      expect(disabledScheduler.isSchedulerRunning()).toBe(false);
+    });
+  });
+
+  describe('Scraping Cycle', () => {
+    test('should run successful scraping cycle', async () => {
+      const mockResults = new Map([
+        ['El País', { success: 5, failed: 0, duplicates: 2, items: [] }],
+        ['El Mundo', { success: 3, failed: 0, duplicates: 1, items: [] }]
+      ]);
+
+      mockContentScrapingService.scrapeFromMultipleSources.mockResolvedValue(mockResults);
+
+      await scrapingScheduler.runScrapingCycle();
+
+      const stats = scrapingScheduler.getStats();
+      expect(stats.totalRuns).toBe(1);
+      expect(stats.successfulRuns).toBe(1);
+      expect(stats.failedRuns).toBe(0);
+      expect(stats.totalItemsScraped).toBe(8); // 5 + 3
+      expect(stats.totalDuplicates).toBe(3); // 2 + 1
+      expect(stats.lastRun).toBeInstanceOf(Date);
+    });
+
+    test.skip('should handle scraping cycle errors with retries', async () => {
+      mockContentScrapingService.scrapeFromMultipleSources
+        .mockRejectedValueOnce(new Error('First attempt failed'))
+        .mockRejectedValueOnce(new Error('Second attempt failed'))
+        .mockResolvedValueOnce(new Map([
+          ['El País', { success: 2, failed: 0, duplicates: 1, items: [] }]
+        ]));
+
+      await scrapingScheduler.runScrapingCycle();
+
+      const stats = scrapingScheduler.getStats();
+      expect(stats.totalRuns).toBe(1);
+      expect(stats.successfulRuns).toBe(1);
+      expect(stats.failedRuns).toBe(0);
+      expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(3);
+    });
+
+    test.skip('should fail after max retries', async () => {
+      mockContentScrapingService.scrapeFromMultipleSources
+        .mockRejectedValue(new Error('Persistent failure'));
+
+      await scrapingScheduler.runScrapingCycle();
+
+      const stats = scrapingScheduler.getStats();
+      expect(stats.totalRuns).toBe(1);
+      expect(stats.successfulRuns).toBe(0);
+      expect(stats.failedRuns).toBe(1);
+      expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(3); // 1 + 2 retries
+    }, 10000);
+
+    test.skip('should not run concurrent cycles', async () => {
+      let resolveFirst: () => void;
+      const firstPromise = new Promise<void>(resolve => {
+        resolveFirst = resolve;
+      });
+
+      mockContentScrapingService.scrapeFromMultipleSources.mockImplementation(() => firstPromise.then(() => new Map()));
+
+      // Start first cycle
+      const firstCycle = scrapingScheduler.runScrapingCycle();
+      expect(scrapingScheduler.isCycleRunning()).toBe(true);
+
+      // Try to start second cycle while first is running
+      const secondCycle = scrapingScheduler.runScrapingCycle();
+
+      // Resolve first cycle
+      resolveFirst!();
+      await firstCycle;
+      await secondCycle;
+
+      const stats = scrapingScheduler.getStats();
+      expect(stats.totalRuns).toBe(1); // Only one cycle should have run
+      expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(1);
+    }, 10000);
+  });
+
+  describe('Single Source Scraping', () => {
+    test('should run single source scraping successfully', async () => {
+      const mockResult = { success: 3, failed: 0, duplicates: 1, items: [] };
+      mockContentScrapingService.scrapeFromSource.mockResolvedValue(mockResult);
+
+      await scrapingScheduler.runSingleSource('El País');
+
+      expect(mockContentScrapingService.scrapeFromSource).toHaveBeenCalledWith({
+        name: 'El País',
+        source: NewsSource.EL_PAIS,
+        webUrls: ['https://elpais.com'],
+        enabled: true
+      });
+    });
+
+    test('should handle unknown source name', async () => {
+      await expect(scrapingScheduler.runSingleSource('Unknown Source'))
+        .rejects.toThrow('Source configuration not found: Unknown Source');
+    });
+
+    test('should handle single source scraping errors', async () => {
+      mockContentScrapingService.scrapeFromSource.mockRejectedValue(new Error('Scraping failed'));
+
+      await expect(scrapingScheduler.runSingleSource('El País'))
+        .rejects.toThrow('Scraping failed');
+    });
+  });
+
+  describe('Configuration Management', () => {
+    test('should update configuration', () => {
+      const newConfig = {
+        intervalMinutes: 60,
+        maxRetries: 5
+      };
+
+      scrapingScheduler.updateConfig(newConfig);
+      const config = scrapingScheduler.getConfig();
+
+      expect(config.intervalMinutes).toBe(60);
+      expect(config.maxRetries).toBe(5);
+      expect(config.retryDelayMinutes).toBe(1); // Should keep existing value
+      expect(config.enabled).toBe(true); // Should keep existing value
+    });
+
+    test('should restart scheduler when updating config while running', () => {
+      scrapingScheduler.start();
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
+
+      scrapingScheduler.updateConfig({ intervalMinutes: 60 });
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
+      expect(scrapingScheduler.getConfig().intervalMinutes).toBe(60);
+    });
+
+    test('should not restart scheduler when updating config while stopped', () => {
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
+
+      scrapingScheduler.updateConfig({ intervalMinutes: 60 });
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
+    });
+  });
+
+  describe('Statistics Management', () => {
+    test('should reset statistics', () => {
+      // Simulate some activity
+      scrapingScheduler.start();
+      const statsBeforeReset = scrapingScheduler.getStats();
+      statsBeforeReset.totalRuns = 5;
+      statsBeforeReset.successfulRuns = 3;
+      statsBeforeReset.totalItemsScraped = 100;
+
+      scrapingScheduler.resetStats();
+      const statsAfterReset = scrapingScheduler.getStats();
+
+      expect(statsAfterReset.totalRuns).toBe(0);
+      expect(statsAfterReset.successfulRuns).toBe(0);
+      expect(statsAfterReset.failedRuns).toBe(0);
+      expect(statsAfterReset.totalItemsScraped).toBe(0);
+      expect(statsAfterReset.totalDuplicates).toBe(0);
+      expect(statsAfterReset.lastRun).toBeNull();
+    });
+  });
+
+  describe('Graceful Shutdown', () => {
+    test('should shutdown gracefully when not running', async () => {
+      await expect(scrapingScheduler.shutdown()).resolves.not.toThrow();
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
+    });
+
+    test.skip('should shutdown gracefully when running', async () => {
+      scrapingScheduler.start();
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
+
+      await scrapingScheduler.shutdown();
+      expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
+    }, 10000);
+  });
+});
							
								
								
									
210  src/__tests__/WebScraper.test.ts  (Normal file)
@@ -0,0 +1,210 @@
+import { WebScraper } from '../utils/WebScraper';
+import { NewsSource } from '../types/Feed';
+import { Logger } from '../utils/logger';
+
+// Mock the Logger
+jest.mock('../utils/logger', () => ({
+  Logger: {
+    error: jest.fn(),
+    warn: jest.fn(),
+    info: jest.fn(),
+    debug: jest.fn()
+  }
+}));
+
+// Mock fetch
+global.fetch = jest.fn();
+
+describe('WebScraper', () => {
+  let webScraper: WebScraper;
+  const mockFetch = fetch as jest.MockedFunction<typeof fetch>;
+
+  beforeEach(() => {
+    webScraper = new WebScraper();
+    jest.clearAllMocks();
+  });
+
+  describe('scrapeUrl', () => {
+    test('should successfully scrape a URL with complete metadata', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <title>Test News Article</title>
+            <meta property="og:title" content="Test News Article">
+            <meta property="og:description" content="This is a test news article description">
+            <meta property="article:published_time" content="2024-01-15T10:30:00Z">
+          </head>
+          <body>
+            <h1>Test News Article</h1>
+            <p>Article content here...</p>
+          </body>
+        </html>
+      `;
+
+      mockFetch.mockResolvedValue({
+        ok: true,
+        text: () => Promise.resolve(mockHtml)
+      } as Response);
+
+      const result = await webScraper.scrapeUrl('https://example.com/news');
+
+      expect(result).toEqual({
+        title: 'Test News Article',
+        description: 'This is a test news article description',
+        url: 'https://example.com/news',
+        publishedAt: new Date('2024-01-15T10:30:00Z')
+      });
+
+      expect(mockFetch).toHaveBeenCalledWith('https://example.com/news', {
+        headers: {
+          'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
+          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+        }
+      });
+    });
+
+    test('should handle HTTP errors gracefully', async () => {
+      mockFetch.mockResolvedValue({
+        ok: false,
+        status: 404,
+        statusText: 'Not Found'
+      } as Response);
+
+      const result = await webScraper.scrapeUrl('https://example.com/not-found');
+
+      expect(result).toBeNull();
+      expect(Logger.error).toHaveBeenCalledWith(
+        'Failed to fetch https://example.com/not-found: 404 Not Found'
+      );
+    });
+
+    test('should handle network errors gracefully', async () => {
+      mockFetch.mockRejectedValue(new Error('Network error'));
+
+      const result = await webScraper.scrapeUrl('https://example.com/error');
+
+      expect(result).toBeNull();
+      expect(Logger.error).toHaveBeenCalledWith(
+        'Error scraping https://example.com/error:',
+        expect.any(Error)
+      );
+    });
+
+    test('should return null when no title is found', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <meta property="og:description" content="Description without title">
+          </head>
+          <body>
+            <p>Content without title</p>
+          </body>
+        </html>
+      `;
+
+      mockFetch.mockResolvedValue({
+        ok: true,
+        text: () => Promise.resolve(mockHtml)
+      } as Response);
+
+      const result = await webScraper.scrapeUrl('https://example.com/no-title');
+
+      expect(result).toBeNull();
+      expect(Logger.warn).toHaveBeenCalledWith('No title found for https://example.com/no-title');
+    });
+
+    test('should return null when no description is found', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <title>Title Only</title>
+          </head>
+          <body>
+            <p>Content without description meta</p>
+          </body>
+        </html>
+      `;
+
+      mockFetch.mockResolvedValue({
+        ok: true,
+        text: () => Promise.resolve(mockHtml)
+      } as Response);
+
+      const result = await webScraper.scrapeUrl('https://example.com/no-description');
+
+      expect(result).toBeNull();
+      expect(Logger.warn).toHaveBeenCalledWith('No description found for https://example.com/no-description');
+    });
+
+    test('should use current date when no published date is found', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <title>Test Article</title>
+            <meta property="og:description" content="Test description">
+          </head>
+        </html>
+      `;
+
+      mockFetch.mockResolvedValue({
+        ok: true,
+        text: () => Promise.resolve(mockHtml)
+      } as Response);
+
+      const beforeScrape = new Date();
+      const result = await webScraper.scrapeUrl('https://example.com/no-date');
+      const afterScrape = new Date();
+
+      expect(result).not.toBeNull();
+      expect(result!.publishedAt.getTime()).toBeGreaterThanOrEqual(beforeScrape.getTime());
+      expect(result!.publishedAt.getTime()).toBeLessThanOrEqual(afterScrape.getTime());
+    });
+  });
+
+  describe('convertToFeedData', () => {
+    test('should convert scraped data to feed format', () => {
+    const scrapedData = {
+      title: 'Test News',
+      description: 'Test description',
+      url: 'https://example.com/news',
+      publishedAt: new Date('2024-01-15T10:00:00Z')
+    };
+
+    const feedData = webScraper.convertToFeedData(scrapedData, NewsSource.EL_PAIS);
+
+    expect(feedData).toEqual({
+      title: 'Test News',
+      description: 'Test description',
+      url: 'https://example.com/news',
+      source: NewsSource.EL_PAIS,
+      publishedAt: new Date('2024-01-15T10:00:00Z'),
+      isManual: false
+    });
+  });
+
+  test('should handle HTML with special characters and entities', async () => {
+    const htmlWithEntities = `
+      <html>
+        <head>
+          <title>News &amp; Updates - El País</title>
+          <meta name="description" content="Breaking news &quot;today&quot; &amp; analysis">
+        </head>
+      </html>
+    `;
+
+    global.fetch = jest.fn().mockResolvedValue({
+      ok: true,
+      text: () => Promise.resolve(htmlWithEntities)
+    });
+
+    const result = await webScraper.scrapeUrl('https://example.com/news');
+
+    expect(result).toEqual({
+      title: 'News & Updates - El País',
+      description: 'Breaking news "today" & analysis',
+      url: 'https://example.com/news',
+      publishedAt: expect.any(Date)
+    });
+  });
+});
 | 
					});
 | 
				
			||||||
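The tests above pin down the WebScraper contract: scrapeUrl() resolves to a {title, description, url, publishedAt} object or null, and convertToFeedData() turns that object into a feed item with source and isManual fields. A minimal usage sketch under those assumptions follows; the target URL and the relative import paths are placeholders, not taken from the repository.

```typescript
import { WebScraper } from '../src/utils/WebScraper';
import { NewsSource } from '../src/types/Feed';

// Illustrative only: scrape a single page and convert it to feed data.
async function scrapeOne(): Promise<void> {
  const webScraper = new WebScraper();
  const scraped = await webScraper.scrapeUrl('https://example.com/article'); // placeholder URL

  if (scraped) {
    const feedData = webScraper.convertToFeedData(scraped, NewsSource.EL_PAIS);
    // isManual is false for scraped items, per the test expectations above
    console.log(feedData.title, feedData.source, feedData.isManual);
  }
}

scrapeOne().catch(console.error);
```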
							
								
								
									
src/extractors/BaseNewspaperExtractor.ts (78 lines, Normal file)
@@ -0,0 +1,78 @@
import { WebScraper } from '../utils/WebScraper';
import { IFeed, NewsSource } from '../types/Feed';
import { NewspaperConfig } from '../types/NewspaperTypes';
import { Logger } from '../utils/logger';

/**
 * Abstract base class for newspaper extractors
 */
export abstract class BaseNewspaperExtractor {
  protected webScraper: WebScraper;
  protected config: NewspaperConfig;

  constructor(config: NewspaperConfig) {
    this.webScraper = new WebScraper();
    this.config = config;
  }

  /**
   * Abstract method that each specific extractor must implement
   */
  abstract extractFrontPageUrls(): Promise<string[]>;

  /**
   * Extracts news items from the front-page URLs
   */
  async extractNews(): Promise<Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[]> {
    try {
      Logger.info(`Extracting front page URLs for ${this.config.name}`);
      const urls = await this.extractFrontPageUrls();

      if (urls.length === 0) {
        Logger.warn(`No URLs found for ${this.config.name}`);
        return [];
      }

      Logger.info(`Found ${urls.length} articles for ${this.config.name}`);
      const newsItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

      for (const url of urls) {
        try {
          const scrapedData = await this.webScraper.scrapeUrl(url);
          if (scrapedData) {
            const feedItem = this.webScraper.convertToFeedData(scrapedData, this.config.source);
            newsItems.push(feedItem);
          }
        } catch (error) {
          Logger.error(`Error scraping article ${url}:`, error);
        }
      }

      return newsItems;
    } catch (error) {
      Logger.error(`Error extracting news for ${this.config.name}:`, error);
      return [];
    }
  }

  /**
   * Returns whether the extractor is enabled
   */
  isEnabled(): boolean {
    return this.config.enabled;
  }

  /**
   * Returns the newspaper name
   */
  getName(): string {
    return this.config.name;
  }

  /**
   * Returns the newspaper source
   */
  getSource(): NewsSource {
    return this.config.source;
  }
}
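The abstract class above follows a Template Method layout: extractNews() drives the whole flow and subclasses only supply extractFrontPageUrls(). As a hedged illustration of that contract (the newspaper name, URLs and selectors below are invented for the sketch and reuse an existing enum member; they are not part of the repository), a minimal concrete extractor could look like this:

```typescript
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
import { NewsSource } from '../types/Feed';

// Hypothetical extractor used only to illustrate the subclassing contract.
class ExampleExtractor extends BaseNewspaperExtractor {
  constructor() {
    super({
      name: 'Example Paper',           // assumed value, not in the repo
      source: NewsSource.EL_PAIS,      // reusing an existing enum member for the sketch
      baseUrl: 'https://example.com',
      frontPageUrl: 'https://example.com',
      selectors: {
        articleLinks: 'article h2 a',
        titleSelector: 'h1',
        descriptionSelector: '.standfirst',
        dateSelector: 'time',
        imageSelector: 'figure img'
      },
      enabled: true
    });
  }

  // The only piece a subclass must provide: which article URLs to visit.
  async extractFrontPageUrls(): Promise<string[]> {
    return ['https://example.com/politica/sample-article'];
  }
}
```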
							
								
								
									
src/extractors/ElMundoExtractor.ts (78 lines, Normal file)
@@ -0,0 +1,78 @@
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

/**
 * Extractor specific to El Mundo
 */
export class ElMundoExtractor extends BaseNewspaperExtractor {
  constructor() {
    super({
      name: 'El Mundo',
      source: NewsSource.EL_MUNDO,
      baseUrl: 'https://elmundo.es',
      frontPageUrl: 'https://elmundo.es',
      selectors: {
        articleLinks: '.ue-c-cover-content__link, .ue-c-cover-content__headline-link, h2 a, h3 a',
        titleSelector: 'h1, .ue-c-article__headline',
        descriptionSelector: '.ue-c-article__standfirst, .ue-c-cover-content__standfirst',
        dateSelector: '.ue-c-article__publishdate, time',
        imageSelector: '.ue-c-article__image img'
      },
      enabled: true
    });
  }

  async extractFrontPageUrls(): Promise<string[]> {
    // Fetch the front-page HTML directly
    const response = await fetch(this.config.frontPageUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
      }
    });

    if (!response.ok) {
      Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
      return [];
    }

    const html = await response.text();
    if (!html) {
      return [];
    }

    try {
      // Extract article links with a regex
      const linkRegex = /<a[^>]+href=["']([^"']*(?:elmundo\.es)?[^"']*)["'][^>]*>.*?<\/a>/gi;
      const urls: string[] = [];
      let match;

      while ((match = linkRegex.exec(html)) !== null) {
        let url = match[1];

        // Keep only URLs that look like relevant articles
        if (url.includes('/espana/') ||
            url.includes('/internacional/') ||
            url.includes('/economia/') ||
            url.includes('/sociedad/') ||
            url.includes('/politica/')) {

          // Convert relative URLs to absolute ones
          if (url.startsWith('/')) {
            url = this.config.baseUrl + url;
          }

          if (!urls.includes(url) && urls.length < 20) {
            urls.push(url);
          }
        }
      }

      return urls;
    } catch (error) {
      Logger.error(`Error extracting El Mundo URLs:`, error);
      return [];
    }
  }
}
src/extractors/ElPaisExtractor.ts (78 lines, Normal file)
@@ -0,0 +1,78 @@
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

/**
 * Extractor specific to El País
 */
export class ElPaisExtractor extends BaseNewspaperExtractor {
  constructor() {
    super({
      name: 'El País',
      source: NewsSource.EL_PAIS,
      baseUrl: 'https://elpais.com',
      frontPageUrl: 'https://elpais.com',
      selectors: {
        articleLinks: 'article h2 a, .c_t a, .articulo-titulo a, h2.articulo-titulo a',
        titleSelector: 'h1, .articulo-titulo',
        descriptionSelector: '.articulo-entradilla, .entradilla, .subtitulo',
        dateSelector: '.articulo-fecha, time',
        imageSelector: '.articulo-foto img, .foto img'
      },
      enabled: true
    });
  }

  async extractFrontPageUrls(): Promise<string[]> {
    // Fetch the front-page HTML directly
    const response = await fetch(this.config.frontPageUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
      }
    });

    if (!response.ok) {
      Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
      return [];
    }

    const html = await response.text();
    if (!html) {
      return [];
    }

    try {
      // Extract article links with a regex
      const linkRegex = /<a[^>]+href=["']([^"']*(?:elpais\.com)?[^"']*)["'][^>]*>.*?<\/a>/gi;
      const urls: string[] = [];
      let match;

      while ((match = linkRegex.exec(html)) !== null) {
        let url = match[1];

        // Keep only URLs that look like relevant articles
        if (url.includes('/politica/') ||
            url.includes('/economia/') ||
            url.includes('/sociedad/') ||
            url.includes('/internacional/') ||
            url.includes('/espana/')) {

          // Convert relative URLs to absolute ones
          if (url.startsWith('/')) {
            url = this.config.baseUrl + url;
          }

          if (!urls.includes(url) && urls.length < 20) {
            urls.push(url);
          }
        }
      }

      return urls;
    } catch (error) {
      Logger.error(`Error extracting El País URLs:`, error);
      return [];
    }
  }
}
src/extractors/NewspaperExtractorFactory.ts (37 lines, Normal file)
@@ -0,0 +1,37 @@
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
import { ElPaisExtractor } from './ElPaisExtractor';
import { ElMundoExtractor } from './ElMundoExtractor';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

/**
 * Factory for creating newspaper extractors
 */
export class NewspaperExtractorFactory {
  static createExtractor(source: NewsSource): BaseNewspaperExtractor | null {
    switch (source) {
      case NewsSource.EL_PAIS:
        return new ElPaisExtractor();
      case NewsSource.EL_MUNDO:
        return new ElMundoExtractor();
      default:
        Logger.warn(`No extractor available for source: ${source}`);
        return null;
    }
  }

  static getAllAvailableExtractors(): BaseNewspaperExtractor[] {
    const extractors: BaseNewspaperExtractor[] = [];

    for (const source of Object.values(NewsSource)) {
      if (source !== NewsSource.MANUAL) {
        const extractor = this.createExtractor(source);
        if (extractor) {
          extractors.push(extractor);
        }
      }
    }

    return extractors;
  }
}
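A hedged usage sketch of the factory and the extractor contract; it only calls methods defined in the files above, but the surrounding script and its import paths are illustrative, not repository code.

```typescript
import { NewspaperExtractorFactory } from './extractors/NewspaperExtractorFactory';
import { NewsSource } from './types/Feed';

// Illustrative only: create one extractor via the factory and run a single pass.
async function runOnce(): Promise<void> {
  const extractor = NewspaperExtractorFactory.createExtractor(NewsSource.EL_PAIS);
  if (!extractor || !extractor.isEnabled()) {
    return;
  }

  // extractNews() fetches the front page, scrapes each article and
  // returns feed items without _id/createdAt/updatedAt.
  const items = await extractor.extractNews();
  console.log(`${extractor.getName()}: ${items.length} items extracted`);
}

runOnce().catch(console.error);
```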
							
								
								
									
src/scraper.ts (61 lines, Normal file)
@@ -0,0 +1,61 @@
import { ScrapingScheduler } from './services/ScrapingScheduler.js';
import { FeedRepository } from './repositories/FeedRepository.js';
import { DatabaseConnection } from './config/database.js';
import { Logger } from './utils/logger.js';

let scheduler: ScrapingScheduler;

async function initializeScraper() {
  try {
    // Connect to database
    await DatabaseConnection.getInstance().connect();
    Logger.database.connected();

    // Initialize repository and scheduler
    const feedRepository = new FeedRepository();
    scheduler = new ScrapingScheduler(feedRepository, {
      intervalMinutes: 30, // Run every 30 minutes
      maxRetries: 2,
      retryDelayMinutes: 5,
      enabled: true
    });

    // Start the scheduler
    scheduler.start();
    Logger.info('Scraping scheduler started successfully');

    // Log initial stats
    const stats = scheduler.getStats();
    Logger.info('Initial scheduler stats', stats);

  } catch (error) {
    Logger.error('Failed to start scraper', { error });
    process.exit(1);
  }
}

const shutdown = async () => {
  try {
    if (scheduler) {
      await scheduler.shutdown();
      Logger.info('Scraping scheduler stopped');
    }

    await DatabaseConnection.getInstance().disconnect();
    Logger.database.disconnected();
    process.exit(0);
  } catch (error) {
    Logger.error('Error during scraper shutdown', { error });
    process.exit(1);
  }
};

// Handle graceful shutdown
process.on('SIGINT', shutdown);
process.on('SIGTERM', shutdown);

// Start the scraper
initializeScraper().catch(error => {
  Logger.error('Failed to initialize scraper', { error });
  process.exit(1);
});
src/services/ContentScrapingService.ts (156 lines, Normal file)
@@ -0,0 +1,156 @@
import { WebScraper } from '../utils/WebScraper.js';
import { ScrapingService } from './ScrapingService.js';
import { IFeed, NewsSource } from '../types/Feed.js';
import { IFeedRepository } from '../repositories/FeedRepository.js';
import { Logger } from '../utils/logger.js';

interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
}

interface NewsSourceConfig {
  name: string;
  source: NewsSource;
  webUrls?: string[];
  enabled: boolean;
}

export class ContentScrapingService {
  private webScraper: WebScraper;
  private scrapingService: ScrapingService;

  constructor(feedRepository: IFeedRepository) {
    this.webScraper = new WebScraper();
    this.scrapingService = new ScrapingService(feedRepository);
  }

  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);

    const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

    for (const url of urls) {
      try {
        const scrapedData = await this.webScraper.scrapeUrl(url);
        if (scrapedData) {
          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
          feedItems.push(feedData);
        }
      } catch (error) {
        Logger.error(`Error scraping URL ${url}:`, error);
      }
    }

    if (feedItems.length === 0) {
      Logger.warn(`No items scraped from web URLs`);
      return { success: 0, failed: urls.length, duplicates: 0, items: [] };
    }

    const results = await this.scrapingService.processFeedBatch(feedItems);
    return this.analyzeResults(results);
  }

  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
    if (!config.enabled) {
      Logger.info(`Skipping disabled source: ${config.name}`);
      return { success: 0, failed: 0, duplicates: 0, items: [] };
    }

    Logger.info(`Starting content scraping for source: ${config.name}`);

    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };

    // Scrape from web URLs if available
    if (config.webUrls && config.webUrls.length > 0) {
      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
      totalResult = this.mergeResults(totalResult, webResult);
    }

    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
    return totalResult;
  }

  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
    Logger.info(`Starting batch scraping from ${configs.length} sources`);

    const results = new Map<string, ScrapingResult>();

    for (const config of configs) {
      try {
        const result = await this.scrapeFromSource(config);
        results.set(config.name, result);
      } catch (error) {
        Logger.error(`Error scraping source ${config.name}:`, error);
        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);

    return results;
  }

  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const duplicates = results.filter(item => item === null).length;

    return {
      success,
      failed: 0, // processFeedBatch doesn't fail individual items, it throws on repository errors
      duplicates,
      items: results
    };
  }

  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
    return {
      success: result1.success + result2.success,
      failed: result1.failed + result2.failed,
      duplicates: result1.duplicates + result2.duplicates,
      items: [...result1.items, ...result2.items]
    };
  }

  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems
    };
  }

  // Utility method to create common news source configurations
  static createNewsSourceConfigs(): NewsSourceConfig[] {
    return [
      {
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      },
      {
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      }
    ];
  }
}
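For orientation, a minimal sketch of driving ContentScrapingService directly, assuming a FeedRepository instance is available as in src/scraper.ts; the explicit webUrls entry and the sample article URL are placeholders invented for the example, not repository data.

```typescript
import { ContentScrapingService } from './services/ContentScrapingService.js';
import { FeedRepository } from './repositories/FeedRepository.js';
import { NewsSource } from './types/Feed.js';

// Illustrative only: scrape one configured source with an explicit URL list.
async function scrapeSample(): Promise<void> {
  const service = new ContentScrapingService(new FeedRepository());

  const results = await service.scrapeFromMultipleSources([
    {
      name: 'El País',
      source: NewsSource.EL_PAIS,
      enabled: true,
      webUrls: ['https://elpais.com/espana/sample-article.html'] // placeholder URL
    }
  ]);

  for (const [name, result] of results) {
    console.log(`${name}: ${result.success} new, ${result.duplicates} duplicates`);
  }
}

scrapeSample().catch(console.error);
```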
							
								
								
									
src/services/FeedReaderService.ts (193 lines, Normal file)
@@ -0,0 +1,193 @@
import { ScrapingService } from './ScrapingService';
import { IFeed, NewsSource } from '../types/Feed';
import { IFeedRepository } from '../repositories/FeedRepository';
import { Logger } from '../utils/logger';
import { BaseNewspaperExtractor } from '../extractors/BaseNewspaperExtractor';
import { NewspaperExtractorFactory } from '../extractors/NewspaperExtractorFactory';
import { ScrapingResult } from '../types/NewspaperTypes';

/**
 * Main feed-reading service based on web scraping
 */
export class FeedReaderService {
  private scrapingService: ScrapingService;
  private extractors: Map<NewsSource, BaseNewspaperExtractor>;

  constructor(feedRepository: IFeedRepository) {
    this.scrapingService = new ScrapingService(feedRepository);
    this.extractors = new Map();
    this.initializeExtractors();
  }

  /**
   * Initializes all available extractors
   */
  private initializeExtractors(): void {
    const availableExtractors = NewspaperExtractorFactory.getAllAvailableExtractors();

    for (const extractor of availableExtractors) {
      this.extractors.set(extractor.getSource(), extractor);
      Logger.info(`Initialized extractor for ${extractor.getName()}`);
    }
  }

  /**
   * Extracts news from a specific newspaper
   */
  async extractFromNewspaper(source: NewsSource): Promise<ScrapingResult> {
    const extractor = this.extractors.get(source);

    if (!extractor) {
      const error = `No extractor found for source: ${source}`;
      Logger.error(error);
      return {
        success: 0,
        failed: 1,
        duplicates: 0,
        items: [],
        errors: [error]
      };
    }

    if (!extractor.isEnabled()) {
      Logger.info(`Skipping disabled extractor: ${extractor.getName()}`);
      return {
        success: 0,
        failed: 0,
        duplicates: 0,
        items: [],
        errors: []
      };
    }

    try {
      Logger.info(`Starting extraction for ${extractor.getName()}`);
      const newsItems = await extractor.extractNews();

      if (newsItems.length === 0) {
        Logger.warn(`No news items extracted for ${extractor.getName()}`);
        return {
          success: 0,
          failed: 0,
          duplicates: 0,
          items: [],
          errors: []
        };
      }

      const results = await this.scrapingService.processFeedBatch(newsItems);
      const analyzed = this.analyzeResults(results);

      Logger.info(`Completed extraction for ${extractor.getName()}: ${analyzed.success} success, ${analyzed.failed} failed, ${analyzed.duplicates} duplicates`);
      return analyzed;
    } catch (error) {
      const errorMsg = `Error extracting from ${extractor.getName()}: ${error}`;
      Logger.error(errorMsg);
      return {
        success: 0,
        failed: 1,
        duplicates: 0,
        items: [],
        errors: [errorMsg]
      };
    }
  }

  /**
   * Extracts news from all available newspapers
   */
  async extractFromAllNewspapers(): Promise<Map<NewsSource, ScrapingResult>> {
    Logger.info(`Starting batch extraction from ${this.extractors.size} newspapers`);
    const results = new Map<NewsSource, ScrapingResult>();

    for (const [source, extractor] of this.extractors) {
      if (extractor.isEnabled()) {
        const result = await this.extractFromNewspaper(source);
        results.set(source, result);
      } else {
        Logger.info(`Skipping disabled newspaper: ${extractor.getName()}`);
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch extraction completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);

    return results;
  }

  /**
   * Returns the list of available newspapers
   */
  getAvailableNewspapers(): { source: NewsSource; name: string; enabled: boolean }[] {
    const newspapers: { source: NewsSource; name: string; enabled: boolean }[] = [];

    for (const [source, extractor] of this.extractors) {
      newspapers.push({
        source,
        name: extractor.getName(),
        enabled: extractor.isEnabled()
      });
    }

    return newspapers;
  }

  /**
   * Enables or disables a specific extractor
   */
  setExtractorEnabled(source: NewsSource, enabled: boolean): boolean {
    const extractor = this.extractors.get(source);
    if (!extractor) {
      Logger.error(`Cannot set enabled state: No extractor found for source ${source}`);
      return false;
    }

    // Note: a real implementation could update the configuration here;
    // for now the change is only logged
    Logger.info(`${enabled ? 'Enabled' : 'Disabled'} extractor for ${extractor.getName()}`);
    return true;
  }

  /**
   * Analyzes the processing results
   */
  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const failed = results.filter(item => item === null).length;

    return {
      success,
      failed,
      duplicates: 0, // ScrapingService handles duplicates internally
      items: results,
      errors: []
    };
  }

  /**
   * Computes aggregate statistics across multiple results
   */
  private calculateTotalStats(results: Map<NewsSource, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];
    const allErrors: string[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
      allErrors.push(...result.errors);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems,
      errors: allErrors
    };
  }
}
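As a hedged sketch (illustrative wiring and import paths, not repository code), FeedReaderService can be driven per newspaper or as a batch over every enabled extractor:

```typescript
import { FeedReaderService } from './services/FeedReaderService';
import { FeedRepository } from './repositories/FeedRepository';
import { NewsSource } from './types/Feed';

// Illustrative only: extract from one source, then from all enabled extractors.
async function readFeeds(): Promise<void> {
  const reader = new FeedReaderService(new FeedRepository());

  const elPais = await reader.extractFromNewspaper(NewsSource.EL_PAIS);
  console.log(`El País: ${elPais.success} new items, ${elPais.errors.length} errors`);

  const all = await reader.extractFromAllNewspapers();
  for (const [source, result] of all) {
    console.log(`${source}: ${result.success} new, ${result.failed} failed`);
  }
}

readFeeds().catch(console.error);
```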
							
								
								
									
src/services/ScrapingScheduler.ts (225 lines, Normal file)
@@ -0,0 +1,225 @@
 | 
				
			|||||||
 | 
					import { ContentScrapingService } from './ContentScrapingService.js';
 | 
				
			||||||
 | 
					import { IFeedRepository } from '../repositories/FeedRepository.js';
 | 
				
			||||||
 | 
					import { Logger } from '../utils/logger.js';
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					interface ScheduleConfig {
 | 
				
			||||||
 | 
					  intervalMinutes: number;
 | 
				
			||||||
 | 
					  maxRetries: number;
 | 
				
			||||||
 | 
					  retryDelayMinutes: number;
 | 
				
			||||||
 | 
					  enabled: boolean;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					interface ScrapingStats {
 | 
				
			||||||
 | 
					  lastRun: Date | null;
 | 
				
			||||||
 | 
					  nextRun: Date | null;
 | 
				
			||||||
 | 
					  totalRuns: number;
 | 
				
			||||||
 | 
					  successfulRuns: number;
 | 
				
			||||||
 | 
					  failedRuns: number;
 | 
				
			||||||
 | 
					  totalItemsScraped: number;
 | 
				
			||||||
 | 
					  totalDuplicates: number;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					export class ScrapingScheduler {
 | 
				
			||||||
 | 
					  private contentScrapingService: ContentScrapingService;
 | 
				
			||||||
 | 
					  private scheduleConfig: ScheduleConfig;
 | 
				
			||||||
 | 
					  private stats: ScrapingStats;
 | 
				
			||||||
 | 
					  private intervalId: NodeJS.Timeout | null = null;
 | 
				
			||||||
 | 
					  private isRunning = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  constructor(
 | 
				
			||||||
 | 
					    feedRepository: IFeedRepository,
 | 
				
			||||||
 | 
					    scheduleConfig: Partial<ScheduleConfig> = {}
 | 
				
			||||||
 | 
					  ) {
 | 
				
			||||||
 | 
					    this.contentScrapingService = new ContentScrapingService(feedRepository);
 | 
				
			||||||
 | 
					    this.scheduleConfig = {
 | 
				
			||||||
 | 
					      intervalMinutes: 30, // Default: every 30 minutes
 | 
				
			||||||
 | 
					      maxRetries: 3,
 | 
				
			||||||
 | 
					      retryDelayMinutes: 5,
 | 
				
			||||||
 | 
					      enabled: true,
 | 
				
			||||||
 | 
					      ...scheduleConfig
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					    this.stats = {
 | 
				
			||||||
 | 
					      lastRun: null,
 | 
				
			||||||
 | 
					      nextRun: null,
 | 
				
			||||||
 | 
					      totalRuns: 0,
 | 
				
			||||||
 | 
					      successfulRuns: 0,
 | 
				
			||||||
 | 
					      failedRuns: 0,
 | 
				
			||||||
 | 
					      totalItemsScraped: 0,
 | 
				
			||||||
 | 
					      totalDuplicates: 0
 | 
				
			||||||
 | 
					    };
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  start(): void {
 | 
				
			||||||
 | 
					    if (this.intervalId || !this.scheduleConfig.enabled) {
 | 
				
			||||||
 | 
					      Logger.warn('Scraping scheduler is already running or disabled');
 | 
				
			||||||
 | 
					      return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Logger.info(`Starting scraping scheduler with ${this.scheduleConfig.intervalMinutes} minute intervals`);
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    // Run immediately on start
 | 
				
			||||||
 | 
					    this.runScrapingCycle();
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    // Schedule recurring runs
 | 
				
			||||||
 | 
					    this.intervalId = setInterval(() => {
 | 
				
			||||||
 | 
					      this.runScrapingCycle();
 | 
				
			||||||
 | 
					    }, this.scheduleConfig.intervalMinutes * 60 * 1000);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    this.updateNextRunTime();
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  stop(): void {
 | 
				
			||||||
 | 
					    if (this.intervalId) {
 | 
				
			||||||
 | 
					      clearInterval(this.intervalId);
 | 
				
			||||||
 | 
					      this.intervalId = null;
 | 
				
			||||||
 | 
					      this.stats.nextRun = null;
 | 
				
			||||||
 | 
					      Logger.info('Scraping scheduler stopped');
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  async runScrapingCycle(): Promise<void> {
 | 
				
			||||||
 | 
					    if (this.isRunning) {
 | 
				
			||||||
 | 
					      Logger.warn('Scraping cycle already in progress, skipping this run');
 | 
				
			||||||
 | 
					      return;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    this.isRunning = true;
 | 
				
			||||||
 | 
					    this.stats.totalRuns++;
 | 
				
			||||||
 | 
					    this.stats.lastRun = new Date();
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    Logger.info(`Starting scraping cycle #${this.stats.totalRuns}`);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    let retryCount = 0;
 | 
				
			||||||
 | 
					    let success = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    while (retryCount <= this.scheduleConfig.maxRetries && !success) {
 | 
				
			||||||
 | 
					      try {
 | 
				
			||||||
 | 
					        const configs = ContentScrapingService.createNewsSourceConfigs();
 | 
				
			||||||
 | 
					        const results = await this.contentScrapingService.scrapeFromMultipleSources(configs);
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        // Update statistics
 | 
				
			||||||
 | 
					        let totalSuccess = 0;
 | 
				
			||||||
 | 
					        let totalDuplicates = 0;
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        for (const [sourceName, result] of results) {
 | 
				
			||||||
 | 
					          totalSuccess += result.success;
 | 
				
			||||||
 | 
					          totalDuplicates += result.duplicates;
 | 
				
			||||||
 | 
					          Logger.info(`${sourceName}: ${result.success} new, ${result.duplicates} duplicates, ${result.failed} failed`);
 | 
				
			||||||
 | 
					        }
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        this.stats.totalItemsScraped += totalSuccess;
 | 
				
			||||||
 | 
					        this.stats.totalDuplicates += totalDuplicates;
 | 
				
			||||||
 | 
					        this.stats.successfulRuns++;
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        Logger.info(`Scraping cycle completed successfully: ${totalSuccess} new items, ${totalDuplicates} duplicates`);
        success = true;

      } catch (error) {
        retryCount++;
        Logger.error(`Scraping cycle failed (attempt ${retryCount}/${this.scheduleConfig.maxRetries + 1}):`, error);

        if (retryCount <= this.scheduleConfig.maxRetries) {
          Logger.info(`Retrying in ${this.scheduleConfig.retryDelayMinutes} minutes...`);
          await this.delay(this.scheduleConfig.retryDelayMinutes * 60 * 1000);
        }
      }
    }

    if (!success) {
      this.stats.failedRuns++;
      Logger.error(`Scraping cycle failed after ${this.scheduleConfig.maxRetries + 1} attempts`);
    }

    this.isRunning = false;
    this.updateNextRunTime();
  }

  async runSingleSource(sourceName: string): Promise<void> {
    Logger.info(`Running single source scraping for: ${sourceName}`);

    try {
      const configs = ContentScrapingService.createNewsSourceConfigs();
      const config = configs.find(c => c.name === sourceName);

      if (!config) {
        throw new Error(`Source configuration not found: ${sourceName}`);
      }

      const result = await this.contentScrapingService.scrapeFromSource(config);
      Logger.info(`Single source scraping completed for ${sourceName}: ${result.success} new, ${result.duplicates} duplicates, ${result.failed} failed`);

    } catch (error) {
      Logger.error(`Single source scraping failed for ${sourceName}:`, error);
      throw error;
    }
  }

  getStats(): ScrapingStats {
    return { ...this.stats };
  }

  getConfig(): ScheduleConfig {
    return { ...this.scheduleConfig };
  }

  updateConfig(newConfig: Partial<ScheduleConfig>): void {
    const wasRunning = this.intervalId !== null;

    if (wasRunning) {
      this.stop();
    }

    this.scheduleConfig = { ...this.scheduleConfig, ...newConfig };
    Logger.info('Scraping scheduler configuration updated', this.scheduleConfig);

    if (wasRunning && this.scheduleConfig.enabled) {
      this.start();
    }
  }

  isSchedulerRunning(): boolean {
    return this.intervalId !== null;
  }

  isCycleRunning(): boolean {
    return this.isRunning;
  }

  resetStats(): void {
    this.stats = {
      lastRun: null,
      nextRun: this.stats.nextRun,
      totalRuns: 0,
      successfulRuns: 0,
      failedRuns: 0,
      totalItemsScraped: 0,
      totalDuplicates: 0
    };
    Logger.info('Scraping scheduler statistics reset');
  }

  private updateNextRunTime(): void {
    if (this.intervalId) {
      this.stats.nextRun = new Date(Date.now() + this.scheduleConfig.intervalMinutes * 60 * 1000);
    }
  }

  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }

  // Graceful shutdown
  async shutdown(): Promise<void> {
    Logger.info('Shutting down scraping scheduler...');

    this.stop();

    // Wait for current cycle to complete if running
    while (this.isRunning) {
      Logger.info('Waiting for current scraping cycle to complete...');
      await this.delay(1000);
    }

    Logger.info('Scraping scheduler shutdown complete');
  }
}
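
For reference, a minimal, self-contained sketch of the retry-and-wait pattern that the scraping cycle above follows. The helper names (`runWithRetries`, `attemptOnce`) are hypothetical and not part of the repository; `maxRetries` and `retryDelayMinutes` mirror the `ScheduleConfig` fields used above.

```typescript
// Hypothetical sketch: one initial attempt plus `maxRetries` retries,
// waiting `retryDelayMinutes` between failed attempts.
const delay = (ms: number): Promise<void> =>
  new Promise<void>(resolve => setTimeout(resolve, ms));

async function runWithRetries(
  attemptOnce: () => Promise<void>,   // stand-in for one scraping cycle
  maxRetries: number,
  retryDelayMinutes: number
): Promise<boolean> {
  let retryCount = 0;
  while (retryCount <= maxRetries) {
    try {
      await attemptOnce();
      return true;                    // success: stop retrying
    } catch (error) {
      retryCount++;
      console.error(`Attempt ${retryCount} failed`, error);
      if (retryCount <= maxRetries) {
        await delay(retryDelayMinutes * 60 * 1000);
      }
    }
  }
  return false;                       // all attempts exhausted
}
```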

ScrapingService.ts
@@ -1,5 +1,5 @@
-import { IFeedRepository } from '../repositories/FeedRepository';
-import { IFeed } from '../types/Feed';
+import { IFeedRepository } from '../repositories/FeedRepository.js';
+import { IFeed } from '../types/Feed.js';
 
 export class ScrapingService {
   constructor(private feedRepository: IFeedRepository) {}

36  src/types/NewspaperTypes.ts  Normal file
@@ -0,0 +1,36 @@
import { NewsSource } from './Feed.js';
import { IFeed } from './Feed.js';

/**
 * Interface describing the extraction configuration for a newspaper
 */
export interface NewspaperConfig {
  name: string;
  source: NewsSource;
  baseUrl: string;
  frontPageUrl: string;
  selectors: NewsSelectors;
  enabled: boolean;
}

/**
 * CSS selectors used to extract specific elements from each newspaper
 */
export interface NewsSelectors {
  articleLinks: string;
  titleSelector?: string;
  descriptionSelector?: string;
  dateSelector?: string;
  imageSelector?: string;
}

/**
 * Result of the scraping process
 */
export interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
  errors: string[];
}
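
As an illustration of how these types compose, a hedged example of a `NewspaperConfig` value. The `source` value and the CSS selectors are invented for this sketch; the real configuration lives in the extractor classes and is not part of this hunk.

```typescript
import { NewsSource } from './Feed.js';
import { NewspaperConfig } from './NewspaperTypes.js';

// Illustrative only: the source value and selectors below are assumptions,
// not the repository's actual El País configuration.
export const exampleConfig: NewspaperConfig = {
  name: 'El País',
  source: 'EL_PAIS' as unknown as NewsSource, // placeholder; real values are defined in Feed.ts
  baseUrl: 'https://elpais.com',
  frontPageUrl: 'https://elpais.com/',
  selectors: {
    articleLinks: 'article h2 a',             // assumed front-page link selector
    titleSelector: 'h1',
    descriptionSelector: 'meta[name="description"]'
  },
  enabled: true
};
```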

143  src/utils/WebScraper.ts  Normal file
@@ -0,0 +1,143 @@
import { IFeed, NewsSource } from '../types/Feed.js';
import { Logger } from './logger.js';

interface ScrapedData {
  title: string;
  description: string;
  url: string;
  publishedAt: Date;
}

export class WebScraper {
  private userAgent = 'Mozilla/5.0 (compatible; DailyTrends/1.0)';

  async scrapeUrl(url: string): Promise<ScrapedData | null> {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': this.userAgent,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
      });

      if (!response.ok) {
        Logger.error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
        return null;
      }

      const html = await response.text();
      return this.parseHtml(html, url);
    } catch (error) {
      Logger.error(`Error scraping ${url}:`, error);
      return null;
    }
  }

  private parseHtml(html: string, url: string): ScrapedData | null {
    try {
      // Extract title from <title> tag or Open Graph
      const title = this.extractTitle(html);
      if (!title) {
        Logger.warn(`No title found for ${url}`);
        return null;
      }

      // Extract description from meta tags
      const description = this.extractDescription(html);
      if (!description) {
        Logger.warn(`No description found for ${url}`);
        return null;
      }

      // Extract published date
      const publishedAt = this.extractPublishedDate(html);

      return {
        title: title.trim(),
        description: description.trim(),
        url,
        publishedAt
      };
    } catch (error) {
      Logger.error(`Error parsing HTML for ${url}:`, error);
      return null;
    }
  }

  private extractTitle(html: string): string | null {
    // Try Open Graph title first
    const ogTitleMatch = html.match(/<meta\s+property=["']og:title["']\s+content=["']([^"']+)["']/i);
    if (ogTitleMatch) {
      return ogTitleMatch[1];
    }

    // Try Twitter title
    const twitterTitleMatch = html.match(/<meta\s+name=["']twitter:title["']\s+content=["']([^"']+)["']/i);
    if (twitterTitleMatch) {
      return twitterTitleMatch[1];
    }

    // Fall back to <title> tag
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    if (titleMatch) {
      return titleMatch[1];
    }

    return null;
  }

  private extractDescription(html: string): string | null {
    // Try Open Graph description first
    const ogDescMatch = html.match(/<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i);
    if (ogDescMatch) {
      return ogDescMatch[1];
    }

    // Try Twitter description
    const twitterDescMatch = html.match(/<meta\s+name=["']twitter:description["']\s+content=["']([^"']+)["']/i);
    if (twitterDescMatch) {
      return twitterDescMatch[1];
    }

    // Try meta description
    const metaDescMatch = html.match(/<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i);
    if (metaDescMatch) {
      return metaDescMatch[1];
    }

    return null;
  }

  private extractPublishedDate(html: string): Date {
    // Try various date formats
    const datePatterns = [
      /<meta\s+property=["']article:published_time["']\s+content=["']([^"']+)["']/i,
      /<meta\s+name=["']pubdate["']\s+content=["']([^"']+)["']/i,
      /<time[^>]+datetime=["']([^"']+)["']/i
    ];

    for (const pattern of datePatterns) {
      const match = html.match(pattern);
      if (match) {
        const date = new Date(match[1]);
        if (!isNaN(date.getTime())) {
          return date;
        }
      }
    }

    // Default to current date if no published date found
    return new Date();
  }

  convertToFeedData(scrapedData: ScrapedData, source: NewsSource): Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'> {
    return {
      title: scrapedData.title,
      description: scrapedData.description,
      url: scrapedData.url,
      source,
      publishedAt: scrapedData.publishedAt,
      isManual: false
    };
  }
}
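
Finally, a short usage sketch for `WebScraper`, assuming Node 18+ (for the global `fetch`) and a caller placed alongside it in `src/utils`. The URL and the `NewsSource` value are placeholders, not values taken from the repository.

```typescript
import { WebScraper } from './WebScraper.js';
import { NewsSource } from '../types/Feed.js';

// Illustrative usage only: the URL and the NewsSource value are placeholders.
async function scrapeOneArticle(): Promise<void> {
  const scraper = new WebScraper();
  const scraped = await scraper.scrapeUrl('https://example.com/some-article');

  if (!scraped) {
    console.log('Nothing could be extracted from the page');
    return;
  }

  // Map the raw scraped fields onto the IFeed shape expected by the repository layer.
  const feedData = scraper.convertToFeedData(scraped, 'EL_PAIS' as unknown as NewsSource);
  console.log(feedData.title, feedData.publishedAt.toISOString());
}

scrapeOneArticle().catch(console.error);
```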