From 891b1e478d6576e31b7798ae8c5219222c9c9069 Mon Sep 17 00:00:00 2001
From: albert
Date: Tue, 29 Jul 2025 12:45:05 +0200
Subject: [PATCH] Add ContentScrapingService and tests

---
 src/__tests__/ContentScrapingService.test.ts | 253 +++++++++++++++++++
 src/services/ContentScrapingService.ts       | 158 +++++++++++
 2 files changed, 411 insertions(+)
 create mode 100644 src/__tests__/ContentScrapingService.test.ts
 create mode 100644 src/services/ContentScrapingService.ts

diff --git a/src/__tests__/ContentScrapingService.test.ts b/src/__tests__/ContentScrapingService.test.ts
new file mode 100644
index 0000000..45e7cf0
--- /dev/null
+++ b/src/__tests__/ContentScrapingService.test.ts
@@ -0,0 +1,253 @@
+import { ContentScrapingService } from '../services/ContentScrapingService';
+import { WebScraper } from '../utils/WebScraper';
+import { ScrapingService } from '../services/ScrapingService';
+import { IFeedRepository } from '../repositories/FeedRepository';
+import { NewsSource } from '../types/Feed';
+
+// Mock dependencies
+jest.mock('../utils/WebScraper');
+jest.mock('../services/ScrapingService');
+jest.mock('../utils/logger');
+
+describe('ContentScrapingService', () => {
+  let contentScrapingService: ContentScrapingService;
+  let mockFeedRepository: jest.Mocked<IFeedRepository>;
+  let mockWebScraper: jest.Mocked<WebScraper>;
+  let mockScrapingService: jest.Mocked<ScrapingService>;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+
+    mockFeedRepository = {
+      create: jest.fn(),
+      findAll: jest.fn(),
+      findById: jest.fn(),
+      findByUrl: jest.fn(),
+      findBySource: jest.fn(),
+      findTodaysFrontPage: jest.fn(),
+      update: jest.fn(),
+      delete: jest.fn(),
+      deleteMany: jest.fn(),
+      count: jest.fn(),
+      exists: jest.fn()
+    };
+
+    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
+    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;
+
+    // Mock constructor calls
+    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
+    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);
+
+    contentScrapingService = new ContentScrapingService(mockFeedRepository);
+  });
+
+  describe('scrapeFromWebUrls', () => {
+    test('should successfully scrape from web URLs', async () => {
+      const mockScrapedData = [
+        {
+          title: 'Web Article 1',
+          description: 'Web Description 1',
+          url: 'https://example.com/web1',
+          publishedAt: new Date()
+        },
+        {
+          title: 'Web Article 2',
+          description: 'Web Description 2',
+          url: 'https://example.com/web2',
+          publishedAt: new Date()
+        }
+      ];
+
+      const mockFeedData = mockScrapedData.map(data => ({
+        ...data,
+        source: NewsSource.EL_MUNDO,
+        isManual: false
+      }));
+
+      const mockResults = [
+        { _id: '1', ...mockFeedData[0] },
+        { _id: '2', ...mockFeedData[1] }
+      ];
+
+      mockWebScraper.scrapeUrl
+        .mockResolvedValueOnce(mockScrapedData[0])
+        .mockResolvedValueOnce(mockScrapedData[1]);
+
+      mockWebScraper.convertToFeedData
+        .mockReturnValueOnce(mockFeedData[0])
+        .mockReturnValueOnce(mockFeedData[1]);
+
+      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);
+
+      const urls = ['https://example.com/web1', 'https://example.com/web2'];
+      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);
+
+      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
+      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
+      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
+      expect(result).toEqual({
+        success: 2,
+        failed: 0,
+        duplicates: 0,
+        items: mockResults
+      });
+    });
+
+    test('should handle failed web scraping', async () => {
+      mockWebScraper.scrapeUrl
+        .mockResolvedValueOnce(null)
+        .mockRejectedValueOnce(new Error('Scraping failed'));
+
+      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
+      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);
+
+      expect(result).toEqual({
+        success: 0,
+        failed: 2,
+        duplicates: 0,
+        items: []
+      });
+      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
+    });
+  });
+
+  describe('scrapeFromSource', () => {
+    test('should scrape from web URLs', async () => {
+      const config = {
+        name: 'Test Source',
+        source: NewsSource.EL_PAIS,
+        webUrls: ['https://example.com/web1'],
+        enabled: true
+      };
+
+      const mockScrapedData = {
+        title: 'Web Article',
+        description: 'Web Description',
+        url: 'https://example.com/web1',
+        publishedAt: new Date()
+      };
+
+      const mockWebFeedData = {
+        ...mockScrapedData,
+        source: NewsSource.EL_PAIS,
+        isManual: false
+      };
+
+      // Mock web scraping
+      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
+      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
+      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);
+
+      const result = await contentScrapingService.scrapeFromSource(config);
+
+      expect(result).toEqual({
+        success: 1,
+        failed: 0,
+        duplicates: 0,
+        items: [{ _id: '1', ...mockWebFeedData }]
+      });
+    });
+
+    test('should skip disabled sources', async () => {
+      const config = {
+        name: 'Disabled Source',
+        source: NewsSource.EL_PAIS,
+        webUrls: ['https://example.com/web1'],
+        enabled: false
+      };
+
+      const result = await contentScrapingService.scrapeFromSource(config);
+
+      expect(result).toEqual({
+        success: 0,
+        failed: 0,
+        duplicates: 0,
+        items: []
+      });
+      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
+    });
+  });
+
+  describe('scrapeFromMultipleSources', () => {
+    test('should scrape from multiple sources', async () => {
+      const configs = [
+        {
+          name: 'Source 1',
+          source: NewsSource.EL_PAIS,
+          webUrls: ['https://example.com/web1'],
+          enabled: true
+        },
+        {
+          name: 'Source 2',
+          source: NewsSource.EL_MUNDO,
+          webUrls: ['https://example.com/web2'],
+          enabled: true
+        }
+      ];
+
+      const mockScrapedData1 = {
+        title: 'Article 1',
+        description: 'Description 1',
+        url: 'https://example.com/web1',
+        publishedAt: new Date()
+      };
+
+      const mockScrapedData2 = {
+        title: 'Article 2',
+        description: 'Description 2',
+        url: 'https://example.com/web2',
+        publishedAt: new Date()
+      };
+
+      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
+      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };
+
+      mockWebScraper.scrapeUrl
+        .mockResolvedValueOnce(mockScrapedData1)
+        .mockResolvedValueOnce(mockScrapedData2);
+
+      mockWebScraper.convertToFeedData
+        .mockReturnValueOnce(mockFeedData1)
+        .mockReturnValueOnce(mockFeedData2);
+
+      mockScrapingService.processFeedBatch
+        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
+        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);
+
+      const results = await contentScrapingService.scrapeFromMultipleSources(configs);
+
+      expect(results.size).toBe(2);
+      expect(results.get('Source 1')).toEqual({
+        success: 1,
+        failed: 0,
+        duplicates: 0,
+        items: [{ _id: '1', ...mockFeedData1 }]
+      });
+      expect(results.get('Source 2')).toEqual({
+        success: 1,
+        failed: 0,
+        duplicates: 0,
+        items: [{ _id: '2', ...mockFeedData2 }]
+      });
+    });
+  });
+
+  describe('createNewsSourceConfigs', () => {
+    test('should create default news source configurations', () => {
+      const configs = ContentScrapingService.createNewsSourceConfigs();
+
+      expect(configs).toHaveLength(2);
+      expect(configs[0]).toEqual({
+        name: 'El País',
+        source: NewsSource.EL_PAIS,
+        enabled: true
+      });
+      expect(configs[1]).toEqual({
+        name: 'El Mundo',
+        source: NewsSource.EL_MUNDO,
+        enabled: true
+      });
+    });
+  });
+});
\ No newline at end of file
diff --git a/src/services/ContentScrapingService.ts b/src/services/ContentScrapingService.ts
new file mode 100644
index 0000000..05a2352
--- /dev/null
+++ b/src/services/ContentScrapingService.ts
@@ -0,0 +1,158 @@
+import { WebScraper } from '../utils/WebScraper.js';
+import { ScrapingService } from './ScrapingService.js';
+import { IFeed, NewsSource } from '../types/Feed.js';
+import { IFeedRepository } from '../repositories/FeedRepository.js';
+import { Logger } from '../utils/logger.js';
+
+interface ScrapingResult {
+  success: number;
+  failed: number;
+  duplicates: number;
+  items: (IFeed | null)[];
+}
+
+interface NewsSourceConfig {
+  name: string;
+  source: NewsSource;
+  webUrls?: string[];
+  enabled: boolean;
+}
+
+export class ContentScrapingService {
+  private webScraper: WebScraper;
+  private scrapingService: ScrapingService;
+
+  constructor(feedRepository: IFeedRepository) {
+    this.webScraper = new WebScraper();
+    this.scrapingService = new ScrapingService(feedRepository);
+  }
+
+  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
+    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);
+
+    const feedItems: Omit<IFeed, '_id'>[] = [];
+    let failedCount = 0;
+
+    for (const url of urls) {
+      try {
+        const scrapedData = await this.webScraper.scrapeUrl(url);
+        if (scrapedData) {
+          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
+          feedItems.push(feedData);
+        } else {
+          failedCount++;
+        }
+      } catch (error) {
+        Logger.error(`Error scraping URL ${url}:`, error);
+        failedCount++;
+      }
+    }
+
+    if (feedItems.length === 0) {
+      Logger.warn('No items scraped from web URLs');
+      return { success: 0, failed: failedCount, duplicates: 0, items: [] };
+    }
+
+    const results = await this.scrapingService.processFeedBatch(feedItems);
+    return { ...this.analyzeResults(results), failed: failedCount };
+  }
+
+  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
+    if (!config.enabled) {
+      Logger.info(`Skipping disabled source: ${config.name}`);
+      return { success: 0, failed: 0, duplicates: 0, items: [] };
+    }
+
+    Logger.info(`Starting content scraping for source: ${config.name}`);
+
+    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };
+
+    // Scrape from web URLs if available
+    if (config.webUrls && config.webUrls.length > 0) {
+      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
+      totalResult = this.mergeResults(totalResult, webResult);
+    }
+
+    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
+    return totalResult;
+  }
+
+  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
+    Logger.info(`Starting batch scraping from ${configs.length} sources`);
+
+    const results = new Map<string, ScrapingResult>();
+
+    for (const config of configs) {
+      try {
+        const result = await this.scrapeFromSource(config);
+        results.set(config.name, result);
+      } catch (error) {
+        Logger.error(`Error scraping source ${config.name}:`, error);
+        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
+      }
+    }
+
+    const totalStats = this.calculateTotalStats(results);
+    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);
+
+    return results;
+  }
+
+  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
+    const success = results.filter(item => item !== null).length;
+    const duplicates = results.filter(item => item === null).length;
+
+    return {
+      success,
+      failed: 0, // processFeedBatch doesn't fail individual items; callers track per-URL failures
+      duplicates,
+      items: results
+    };
+  }
+
+  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
+    return {
+      success: result1.success + result2.success,
+      failed: result1.failed + result2.failed,
+      duplicates: result1.duplicates + result2.duplicates,
+      items: [...result1.items, ...result2.items]
+    };
+  }
+
+  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
+    let totalSuccess = 0;
+    let totalFailed = 0;
+    let totalDuplicates = 0;
+    const allItems: (IFeed | null)[] = [];
+
+    for (const result of results.values()) {
+      totalSuccess += result.success;
+      totalFailed += result.failed;
+      totalDuplicates += result.duplicates;
+      allItems.push(...result.items);
+    }
+
+    return {
+      success: totalSuccess,
+      failed: totalFailed,
+      duplicates: totalDuplicates,
+      items: allItems
+    };
+  }
+
+  // Utility method to create common news source configurations
+  static createNewsSourceConfigs(): NewsSourceConfig[] {
+    return [
+      {
+        name: 'El País',
+        source: NewsSource.EL_PAIS,
+        enabled: true
+      },
+      {
+        name: 'El Mundo',
+        source: NewsSource.EL_MUNDO,
+        enabled: true
+      }
+    ];
+  }
+}
\ No newline at end of file
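
--
Usage note (reviewer sketch, not part of the patch): a minimal example of how the new service might be wired up at application startup. Only `ContentScrapingService` and its static `createNewsSourceConfigs()` helper come from this patch; the concrete `FeedRepository` class, its no-argument constructor, and the front-page URL are assumptions for illustration.

    import { ContentScrapingService } from './services/ContentScrapingService.js';
    // Assumed concrete implementation of IFeedRepository; substitute your own.
    import { FeedRepository } from './repositories/FeedRepository.js';

    async function runScrape(): Promise<void> {
      // Hypothetical repository construction.
      const feedRepository = new FeedRepository();
      const service = new ContentScrapingService(feedRepository);

      // Built-in defaults: El País and El Mundo, both enabled, no webUrls yet.
      const configs = ContentScrapingService.createNewsSourceConfigs();

      // A source without webUrls is effectively a no-op, so URLs are
      // attached here first (hypothetical front-page URL).
      configs[0].webUrls = ['https://elpais.com/'];

      const results = await service.scrapeFromMultipleSources(configs);
      for (const [name, result] of results) {
        console.log(`${name}: ${result.success} stored, ${result.duplicates} duplicates, ${result.failed} failed`);
      }
    }

Design observation: scrapeFromMultipleSources processes sources sequentially and isolates per-source errors in the returned Map, so one failing source cannot abort the batch.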