ContentScrapingService

albert
2025-07-29 12:45:05 +02:00
parent d35416b5c8
commit 891b1e478d
2 changed files with 415 additions and 0 deletions


@ -0,0 +1,259 @@
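// Unit tests for ContentScrapingService. WebScraper, ScrapingService and the
// logger are all mocked below, so these tests exercise orchestration logic
// only, with no network or database access.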
import { ContentScrapingService } from '../services/ContentScrapingService';
import { WebScraper } from '../utils/WebScraper';
import { ScrapingService } from '../services/ScrapingService';
import { IFeedRepository } from '../repositories/FeedRepository';
import { NewsSource } from '../types/Feed';

// Mock dependencies
jest.mock('../utils/WebScraper');
jest.mock('../services/ScrapingService');
jest.mock('../utils/logger');

describe('ContentScrapingService', () => {
  let contentScrapingService: ContentScrapingService;
  let mockFeedRepository: jest.Mocked<IFeedRepository>;
  let mockWebScraper: jest.Mocked<WebScraper>;
  let mockScrapingService: jest.Mocked<ScrapingService>;

  beforeEach(() => {
    jest.clearAllMocks();

    mockFeedRepository = {
      create: jest.fn(),
      findAll: jest.fn(),
      findById: jest.fn(),
      findByUrl: jest.fn(),
      findBySource: jest.fn(),
      findTodaysFrontPage: jest.fn(),
      update: jest.fn(),
      delete: jest.fn(),
      deleteMany: jest.fn(),
      count: jest.fn(),
      exists: jest.fn()
    };

    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;

    // Make the constructors called inside ContentScrapingService return our mocks
    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);

    contentScrapingService = new ContentScrapingService(mockFeedRepository);
  });
  describe('scrapeFromWebUrls', () => {
    test('should successfully scrape from web URLs', async () => {
      const mockScrapedData = [
        {
          title: 'Web Article 1',
          description: 'Web Description 1',
          url: 'https://example.com/web1',
          publishedAt: new Date()
        },
        {
          title: 'Web Article 2',
          description: 'Web Description 2',
          url: 'https://example.com/web2',
          publishedAt: new Date()
        }
      ];
      const mockFeedData = mockScrapedData.map(data => ({
        ...data,
        source: NewsSource.EL_MUNDO,
        isManual: false
      }));
      const mockResults = [
        { _id: '1', ...mockFeedData[0] },
        { _id: '2', ...mockFeedData[1] }
      ];

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData[0])
        .mockResolvedValueOnce(mockScrapedData[1]);
      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData[0])
        .mockReturnValueOnce(mockFeedData[1]);
      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);

      const urls = ['https://example.com/web1', 'https://example.com/web2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
      expect(result).toEqual({
        success: 2,
        failed: 0,
        duplicates: 0,
        items: mockResults
      });
    });

    test('should handle failed web scraping', async () => {
      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(null)
        .mockRejectedValueOnce(new Error('Scraping failed'));

      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(result).toEqual({
        success: 0,
        failed: 2,
        duplicates: 0,
        items: []
      });
      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
    });
  });
  describe('scrapeFromSource', () => {
    test('should scrape from web URLs', async () => {
      const config = {
        name: 'Test Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: true
      };
      const mockScrapedData = {
        title: 'Web Article',
        description: 'Web Description',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };
      const mockWebFeedData = {
        ...mockScrapedData,
        source: NewsSource.EL_PAIS,
        isManual: false
      };

      // Mock web scraping
      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockWebFeedData }]
      });
    });

    test('should skip disabled sources', async () => {
      const config = {
        name: 'Disabled Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: false
      };

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 0,
        failed: 0,
        duplicates: 0,
        items: []
      });
      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
    });
  });
  describe('scrapeFromMultipleSources', () => {
    test('should scrape from multiple sources', async () => {
      const configs = [
        {
          name: 'Source 1',
          source: NewsSource.EL_PAIS,
          webUrls: ['https://example.com/web1'],
          enabled: true
        },
        {
          name: 'Source 2',
          source: NewsSource.EL_MUNDO,
          webUrls: ['https://example.com/web2'],
          enabled: true
        }
      ];
      const mockScrapedData1 = {
        title: 'Article 1',
        description: 'Description 1',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };
      const mockScrapedData2 = {
        title: 'Article 2',
        description: 'Description 2',
        url: 'https://example.com/web2',
        publishedAt: new Date()
      };
      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData1)
        .mockResolvedValueOnce(mockScrapedData2);
      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData1)
        .mockReturnValueOnce(mockFeedData2);
      mockScrapingService.processFeedBatch
        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);

      const results = await contentScrapingService.scrapeFromMultipleSources(configs);

      expect(results.size).toBe(2);
      expect(results.get('Source 1')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockFeedData1 }]
      });
      expect(results.get('Source 2')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '2', ...mockFeedData2 }]
      });
    });
  });
  describe('createNewsSourceConfigs', () => {
    test('should create default news source configurations', () => {
      const configs = ContentScrapingService.createNewsSourceConfigs();

      expect(configs).toHaveLength(2);
      expect(configs[0]).toEqual({
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      });
      expect(configs[1]).toEqual({
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      });
    });
  });
});


@ -0,0 +1,156 @@
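// Orchestrates web scraping for configured news sources: WebScraper fetches
// and normalizes pages, and ScrapingService persists the results through the
// feed repository.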
import { WebScraper } from '../utils/WebScraper.js';
import { ScrapingService } from './ScrapingService.js';
import { IFeed, NewsSource } from '../types/Feed.js';
import { IFeedRepository } from '../repositories/FeedRepository.js';
import { Logger } from '../utils/logger.js';

// Result of a scraping run. `items` preserves batch order; a null entry marks
// an item that processFeedBatch skipped as a duplicate.
interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
}

interface NewsSourceConfig {
  name: string;
  source: NewsSource;
  webUrls?: string[];
  enabled: boolean;
}
export class ContentScrapingService {
  private webScraper: WebScraper;
  private scrapingService: ScrapingService;

  constructor(feedRepository: IFeedRepository) {
    this.webScraper = new WebScraper();
    this.scrapingService = new ScrapingService(feedRepository);
  }
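
  // Scrapes each URL independently: a URL that returns no data or throws is
  // logged and counted as failed, and the surviving items are persisted in a
  // single batch through ScrapingService.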
  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);
    const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

    for (const url of urls) {
      try {
        const scrapedData = await this.webScraper.scrapeUrl(url);
        if (scrapedData) {
          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
          feedItems.push(feedData);
        }
      } catch (error) {
        Logger.error(`Error scraping URL ${url}:`, error);
      }
    }

    // URLs that yielded nothing (null result or thrown error) count as failed
    const failed = urls.length - feedItems.length;
    if (feedItems.length === 0) {
      Logger.warn('No items scraped from web URLs');
      return { success: 0, failed, duplicates: 0, items: [] };
    }

    const results = await this.scrapingService.processFeedBatch(feedItems);
    return { ...this.analyzeResults(results), failed };
  }
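
  // Runs a full scrape for one configured source. Disabled sources are
  // skipped; web URLs are currently the only supported channel, so a config
  // without webUrls yields an empty result.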
  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
    if (!config.enabled) {
      Logger.info(`Skipping disabled source: ${config.name}`);
      return { success: 0, failed: 0, duplicates: 0, items: [] };
    }

    Logger.info(`Starting content scraping for source: ${config.name}`);
    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };

    // Scrape from web URLs if available
    if (config.webUrls && config.webUrls.length > 0) {
      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
      totalResult = this.mergeResults(totalResult, webResult);
    }

    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
    return totalResult;
  }
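
  // Processes sources sequentially so one slow or failing source does not
  // abort the batch; an unexpected error is recorded as a single failure for
  // that source.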
  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
    Logger.info(`Starting batch scraping from ${configs.length} sources`);
    const results = new Map<string, ScrapingResult>();

    for (const config of configs) {
      try {
        const result = await this.scrapeFromSource(config);
        results.set(config.name, result);
      } catch (error) {
        Logger.error(`Error scraping source ${config.name}:`, error);
        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);
    return results;
  }
  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const duplicates = results.filter(item => item === null).length;
    return {
      success,
      failed: 0, // per-URL scrape failures are counted by the caller; processFeedBatch throws on repository errors rather than failing individual items
      duplicates,
      items: results
    };
  }
  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
    return {
      success: result1.success + result2.success,
      failed: result1.failed + result2.failed,
      duplicates: result1.duplicates + result2.duplicates,
      items: [...result1.items, ...result2.items]
    };
  }

  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems
    };
  }
  // Utility method to create common news source configurations
  static createNewsSourceConfigs(): NewsSourceConfig[] {
    return [
      {
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      },
      {
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      }
    ];
  }
}
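
// A minimal usage sketch, not part of the class above: it assumes some
// concrete IFeedRepository implementation exists elsewhere in the app, and
// the front-page URLs and function name are hypothetical placeholders.
export async function runDefaultScrape(feedRepository: IFeedRepository): Promise<void> {
  // Start from the built-in configs and attach assumed front-page URLs
  const configs = ContentScrapingService.createNewsSourceConfigs().map(config => ({
    ...config,
    webUrls: config.source === NewsSource.EL_PAIS
      ? ['https://elpais.com']
      : ['https://www.elmundo.es']
  }));

  const service = new ContentScrapingService(feedRepository);
  const results = await service.scrapeFromMultipleSources(configs);

  for (const [name, result] of results) {
    Logger.info(`${name}: ${result.success} new, ${result.duplicates} duplicates, ${result.failed} failed`);
  }
}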