ContentScrapingService

src/__tests__/ContentScrapingService.test.ts (new file, 259 lines)
@@ -0,0 +1,259 @@
import { ContentScrapingService } from '../services/ContentScrapingService';
import { WebScraper } from '../utils/WebScraper';
import { ScrapingService } from '../services/ScrapingService';
import { IFeedRepository } from '../repositories/FeedRepository';
import { NewsSource } from '../types/Feed';

// Mock dependencies
jest.mock('../utils/WebScraper');
jest.mock('../services/ScrapingService');
jest.mock('../utils/logger');

describe('ContentScrapingService', () => {
  let contentScrapingService: ContentScrapingService;
  let mockFeedRepository: jest.Mocked<IFeedRepository>;
  let mockWebScraper: jest.Mocked<WebScraper>;
  let mockScrapingService: jest.Mocked<ScrapingService>;

  beforeEach(() => {
    jest.clearAllMocks();

    mockFeedRepository = {
      create: jest.fn(),
      findAll: jest.fn(),
      findById: jest.fn(),
      findByUrl: jest.fn(),
      findBySource: jest.fn(),
      findTodaysFrontPage: jest.fn(),
      update: jest.fn(),
      delete: jest.fn(),
      deleteMany: jest.fn(),
      count: jest.fn(),
      exists: jest.fn()
    };

    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;

    // Make the mocked constructors hand back our instances
    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);

    contentScrapingService = new ContentScrapingService(mockFeedRepository);
  });

  describe('scrapeFromWebUrls', () => {
    test('should successfully scrape from web URLs', async () => {
      const mockScrapedData = [
        {
          title: 'Web Article 1',
          description: 'Web Description 1',
          url: 'https://example.com/web1',
          publishedAt: new Date()
        },
        {
          title: 'Web Article 2',
          description: 'Web Description 2',
          url: 'https://example.com/web2',
          publishedAt: new Date()
        }
      ];

      const mockFeedData = mockScrapedData.map(data => ({
        ...data,
        source: NewsSource.EL_MUNDO,
        isManual: false
      }));

      const mockResults = [
        { _id: '1', ...mockFeedData[0] },
        { _id: '2', ...mockFeedData[1] }
      ];

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData[0])
        .mockResolvedValueOnce(mockScrapedData[1]);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData[0])
        .mockReturnValueOnce(mockFeedData[1]);

      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);

      const urls = ['https://example.com/web1', 'https://example.com/web2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
      expect(result).toEqual({
        success: 2,
        failed: 0,
        duplicates: 0,
        items: mockResults
      });
    });

    test('should handle failed web scraping', async () => {
      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(null)
        .mockRejectedValueOnce(new Error('Scraping failed'));

      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(result).toEqual({
        success: 0,
        failed: 2,
        duplicates: 0,
        items: []
      });
      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromSource', () => {
    test('should scrape from web URLs', async () => {
      const config = {
        name: 'Test Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: true
      };

      const mockScrapedData = {
        title: 'Web Article',
        description: 'Web Description',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockWebFeedData = {
        ...mockScrapedData,
        source: NewsSource.EL_PAIS,
        isManual: false
      };

      // Mock web scraping
      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockWebFeedData }]
      });
    });

    test('should skip disabled sources', async () => {
      const config = {
        name: 'Disabled Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: false
      };

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 0,
        failed: 0,
        duplicates: 0,
        items: []
      });
      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromMultipleSources', () => {
    test('should scrape from multiple sources', async () => {
      const configs = [
        {
          name: 'Source 1',
          source: NewsSource.EL_PAIS,
          webUrls: ['https://example.com/web1'],
          enabled: true
        },
        {
          name: 'Source 2',
          source: NewsSource.EL_MUNDO,
          webUrls: ['https://example.com/web2'],
          enabled: true
        }
      ];

      const mockScrapedData1 = {
        title: 'Article 1',
        description: 'Description 1',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockScrapedData2 = {
        title: 'Article 2',
        description: 'Description 2',
        url: 'https://example.com/web2',
        publishedAt: new Date()
      };

      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData1)
        .mockResolvedValueOnce(mockScrapedData2);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData1)
        .mockReturnValueOnce(mockFeedData2);

      mockScrapingService.processFeedBatch
        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);

      const results = await contentScrapingService.scrapeFromMultipleSources(configs);

      expect(results.size).toBe(2);
      expect(results.get('Source 1')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockFeedData1 }]
      });
      expect(results.get('Source 2')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '2', ...mockFeedData2 }]
      });
    });
  });

  describe('createNewsSourceConfigs', () => {
    test('should create default news source configurations', () => {
      const configs = ContentScrapingService.createNewsSourceConfigs();

      expect(configs).toHaveLength(2);
      expect(configs[0]).toEqual({
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      });
      expect(configs[1]).toEqual({
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      });
    });
  });
});

src/services/ContentScrapingService.ts (new file, 156 lines)
@@ -0,0 +1,156 @@
import { WebScraper } from '../utils/WebScraper.js';
import { ScrapingService } from './ScrapingService.js';
import { IFeed, NewsSource } from '../types/Feed.js';
import { IFeedRepository } from '../repositories/FeedRepository.js';
import { Logger } from '../utils/logger.js';

interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
}

interface NewsSourceConfig {
  name: string;
  source: NewsSource;
  webUrls?: string[];
  enabled: boolean;
}

export class ContentScrapingService {
  private webScraper: WebScraper;
  private scrapingService: ScrapingService;

  constructor(feedRepository: IFeedRepository) {
    this.webScraper = new WebScraper();
    this.scrapingService = new ScrapingService(feedRepository);
  }

  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);

    const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

    for (const url of urls) {
      try {
        const scrapedData = await this.webScraper.scrapeUrl(url);
        if (scrapedData) {
          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
          feedItems.push(feedData);
        }
      } catch (error) {
        Logger.error(`Error scraping URL ${url}:`, error);
      }
    }

    if (feedItems.length === 0) {
      Logger.warn('No items scraped from web URLs');
      return { success: 0, failed: urls.length, duplicates: 0, items: [] };
    }

    const results = await this.scrapingService.processFeedBatch(feedItems);
    return this.analyzeResults(results);
  }

  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
    if (!config.enabled) {
      Logger.info(`Skipping disabled source: ${config.name}`);
      return { success: 0, failed: 0, duplicates: 0, items: [] };
    }

    Logger.info(`Starting content scraping for source: ${config.name}`);

    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };

    // Scrape from web URLs if available
    if (config.webUrls && config.webUrls.length > 0) {
      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
      totalResult = this.mergeResults(totalResult, webResult);
    }

    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
    return totalResult;
  }

  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
    Logger.info(`Starting batch scraping from ${configs.length} sources`);

    const results = new Map<string, ScrapingResult>();

    for (const config of configs) {
      try {
        const result = await this.scrapeFromSource(config);
        results.set(config.name, result);
      } catch (error) {
        Logger.error(`Error scraping source ${config.name}:`, error);
        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);

    return results;
  }

  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const duplicates = results.filter(item => item === null).length;

    return {
      success,
      failed: 0, // processFeedBatch doesn't fail individual items; it throws on repository errors
      duplicates,
      items: results
    };
  }

  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
    return {
      success: result1.success + result2.success,
      failed: result1.failed + result2.failed,
      duplicates: result1.duplicates + result2.duplicates,
      items: [...result1.items, ...result2.items]
    };
  }

  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems
    };
  }

  // Utility method to create common news source configurations
  static createNewsSourceConfigs(): NewsSourceConfig[] {
    return [
      {
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      },
      {
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      }
    ];
  }
}
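
Usage note: a minimal sketch of how the new service could be wired up and run. The `feedRepository` binding stands in for whichever concrete IFeedRepository implementation the project provides, and the front-page URLs are placeholders; createNewsSourceConfigs() returns the two default sources with no webUrls attached, so callers are expected to supply them.

import { ContentScrapingService } from './services/ContentScrapingService.js';
import { NewsSource } from './types/Feed.js';
import { IFeedRepository } from './repositories/FeedRepository.js';

// Stand-in for the project's concrete repository (hypothetical).
declare const feedRepository: IFeedRepository;

async function runScrape(): Promise<void> {
  const service = new ContentScrapingService(feedRepository);

  // Attach placeholder front-page URLs to the default configs.
  const configs = ContentScrapingService.createNewsSourceConfigs().map(config => ({
    ...config,
    webUrls: config.source === NewsSource.EL_PAIS
      ? ['https://elpais.com']     // placeholder URL
      : ['https://www.elmundo.es'] // placeholder URL
  }));

  const results = await service.scrapeFromMultipleSources(configs);
  for (const [name, result] of results) {
    console.log(`${name}: ${result.success} stored, ${result.duplicates} duplicates, ${result.failed} failed`);
  }
}

runScrape().catch(console.error);

One design consequence worth noting: scrapeFromMultipleSources never rejects because of a single bad source; a config whose scrape throws is recorded in the map as { success: 0, failed: 1, duplicates: 0, items: [] } and the batch continues with the remaining sources.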