ContentScrapingService
src/__tests__/ContentScrapingService.test.ts (new file, 259 lines)
@@ -0,0 +1,259 @@
import { ContentScrapingService } from '../services/ContentScrapingService';
import { WebScraper } from '../utils/WebScraper';
import { ScrapingService } from '../services/ScrapingService';
import { IFeedRepository } from '../repositories/FeedRepository';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

// Mock dependencies
jest.mock('../utils/WebScraper');
jest.mock('../services/ScrapingService');
jest.mock('../utils/logger');

describe('ContentScrapingService', () => {
  let contentScrapingService: ContentScrapingService;
  let mockFeedRepository: jest.Mocked<IFeedRepository>;
  let mockWebScraper: jest.Mocked<WebScraper>;
  let mockScrapingService: jest.Mocked<ScrapingService>;

  beforeEach(() => {
    jest.clearAllMocks();

    mockFeedRepository = {
      create: jest.fn(),
      findAll: jest.fn(),
      findById: jest.fn(),
      findByUrl: jest.fn(),
      findBySource: jest.fn(),
      findTodaysFrontPage: jest.fn(),
      update: jest.fn(),
      delete: jest.fn(),
      deleteMany: jest.fn(),
      count: jest.fn(),
      exists: jest.fn()
    };

    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;

    // Mock constructor calls
    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);

    contentScrapingService = new ContentScrapingService(mockFeedRepository);
  });

  describe('scrapeFromWebUrls', () => {
    test('should successfully scrape from web URLs', async () => {
      const mockScrapedData = [
        {
          title: 'Web Article 1',
          description: 'Web Description 1',
          url: 'https://example.com/web1',
          publishedAt: new Date()
        },
        {
          title: 'Web Article 2',
          description: 'Web Description 2',
          url: 'https://example.com/web2',
          publishedAt: new Date()
        }
      ];

      const mockFeedData = mockScrapedData.map(data => ({
        ...data,
        source: NewsSource.EL_MUNDO,
        isManual: false
      }));

      const mockResults = [
        { _id: '1', ...mockFeedData[0] },
        { _id: '2', ...mockFeedData[1] }
      ];

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData[0])
        .mockResolvedValueOnce(mockScrapedData[1]);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData[0])
        .mockReturnValueOnce(mockFeedData[1]);

      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);

      const urls = ['https://example.com/web1', 'https://example.com/web2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
      expect(result).toEqual({
        success: 2,
        failed: 0,
        duplicates: 0,
        items: mockResults
      });
    });

    test('should handle failed web scraping', async () => {
      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(null)
        .mockRejectedValueOnce(new Error('Scraping failed'));

      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(result).toEqual({
        success: 0,
        failed: 2,
        duplicates: 0,
        items: []
      });
      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromSource', () => {
    test('should scrape from web URLs', async () => {
      const config = {
        name: 'Test Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: true
      };

      const mockScrapedData = {
        title: 'Web Article',
        description: 'Web Description',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockWebFeedData = {
        ...mockScrapedData,
        source: NewsSource.EL_PAIS,
        isManual: false
      };

      // Mock web scraping
      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockWebFeedData }]
      });
    });

    test('should skip disabled sources', async () => {
      const config = {
        name: 'Disabled Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: false
      };

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 0,
        failed: 0,
        duplicates: 0,
        items: []
      });
      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromMultipleSources', () => {
    test('should scrape from multiple sources', async () => {
      const configs = [
        {
          name: 'Source 1',
          source: NewsSource.EL_PAIS,
          webUrls: ['https://example.com/web1'],
          enabled: true
        },
        {
          name: 'Source 2',
          source: NewsSource.EL_MUNDO,
          webUrls: ['https://example.com/web2'],
          enabled: true
        }
      ];

      const mockScrapedData1 = {
        title: 'Article 1',
        description: 'Description 1',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockScrapedData2 = {
        title: 'Article 2',
        description: 'Description 2',
        url: 'https://example.com/web2',
        publishedAt: new Date()
      };

      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData1)
        .mockResolvedValueOnce(mockScrapedData2);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData1)
        .mockReturnValueOnce(mockFeedData2);

      mockScrapingService.processFeedBatch
        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);

      const results = await contentScrapingService.scrapeFromMultipleSources(configs);

      expect(results.size).toBe(2);
      expect(results.get('Source 1')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockFeedData1 }]
      });
      expect(results.get('Source 2')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '2', ...mockFeedData2 }]
      });
    });
  });

  describe('createNewsSourceConfigs', () => {
    test('should create default news source configurations', () => {
      const configs = ContentScrapingService.createNewsSourceConfigs();

      expect(configs).toHaveLength(2);
      expect(configs[0]).toEqual({
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      });
      expect(configs[1]).toEqual({
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      });
    });
  });
});
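Note: the tests mock the bare specifiers ('../utils/WebScraper'), while the service below imports with explicit '.js' extensions. For both to resolve to the same module under Jest, a ts-jest setup roughly like the following would be needed; this is a sketch of an assumed config, not part of this commit:

// jest.config.ts (hypothetical): map ESM-style '.js' specifiers back to the TS sources
import type { Config } from 'jest';

const config: Config = {
  preset: 'ts-jest',
  testEnvironment: 'node',
  // Strip the '.js' extension so '../utils/WebScraper.js' resolves to WebScraper.ts
  moduleNameMapper: {
    '^(\\.{1,2}/.*)\\.js$': '$1'
  }
};

export default config;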
src/services/ContentScrapingService.ts (new file, 156 lines)
@@ -0,0 +1,156 @@
import { WebScraper } from '../utils/WebScraper.js';
import { ScrapingService } from './ScrapingService.js';
import { IFeed, NewsSource } from '../types/Feed.js';
import { IFeedRepository } from '../repositories/FeedRepository.js';
import { Logger } from '../utils/logger.js';

interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
}

interface NewsSourceConfig {
  name: string;
  source: NewsSource;
  webUrls?: string[];
  enabled: boolean;
}

export class ContentScrapingService {
  private webScraper: WebScraper;
  private scrapingService: ScrapingService;

  constructor(feedRepository: IFeedRepository) {
    this.webScraper = new WebScraper();
    this.scrapingService = new ScrapingService(feedRepository);
  }

  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);

    const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

    for (const url of urls) {
      try {
        const scrapedData = await this.webScraper.scrapeUrl(url);
        if (scrapedData) {
          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
          feedItems.push(feedData);
        }
      } catch (error) {
        Logger.error(`Error scraping URL ${url}:`, error);
      }
    }

    if (feedItems.length === 0) {
      Logger.warn(`No items scraped from web URLs`);
      return { success: 0, failed: urls.length, duplicates: 0, items: [] };
    }

    const results = await this.scrapingService.processFeedBatch(feedItems);
    return this.analyzeResults(results);
  }

  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
    if (!config.enabled) {
      Logger.info(`Skipping disabled source: ${config.name}`);
      return { success: 0, failed: 0, duplicates: 0, items: [] };
    }

    Logger.info(`Starting content scraping for source: ${config.name}`);

    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };

    // Scrape from web URLs if available
    if (config.webUrls && config.webUrls.length > 0) {
      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
      totalResult = this.mergeResults(totalResult, webResult);
    }

    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
    return totalResult;
  }

  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
    Logger.info(`Starting batch scraping from ${configs.length} sources`);

    const results = new Map<string, ScrapingResult>();

    for (const config of configs) {
      try {
        const result = await this.scrapeFromSource(config);
        results.set(config.name, result);
      } catch (error) {
        Logger.error(`Error scraping source ${config.name}:`, error);
        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);

    return results;
  }

  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const duplicates = results.filter(item => item === null).length;

    return {
      success,
      failed: 0, // processFeedBatch doesn't fail individual items, it throws on repository errors
      duplicates,
      items: results
    };
  }

  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
    return {
      success: result1.success + result2.success,
      failed: result1.failed + result2.failed,
      duplicates: result1.duplicates + result2.duplicates,
      items: [...result1.items, ...result2.items]
    };
  }

  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems
    };
  }

  // Utility method to create common news source configurations
  static createNewsSourceConfigs(): NewsSourceConfig[] {
    return [
      {
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      },
      {
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      }
    ];
  }
}
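For context, a minimal usage sketch of the new service. Only ContentScrapingService and its static helper come from this commit; the repository class and URLs are placeholders for whatever IFeedRepository implementation the app provides:

// Hypothetical wiring: MongoFeedRepository stands in for the app's IFeedRepository implementation
import { ContentScrapingService } from './services/ContentScrapingService.js';
import { MongoFeedRepository } from './repositories/FeedRepository.js';

async function runScrape(): Promise<void> {
  const service = new ContentScrapingService(new MongoFeedRepository());

  // Start from the default El País / El Mundo configs and fill in webUrls per source
  const configs = ContentScrapingService.createNewsSourceConfigs()
    .map(c => ({ ...c, webUrls: [`https://example.com/${c.source}`] }));

  const results = await service.scrapeFromMultipleSources(configs);
  for (const [name, result] of results) {
    // items contains null entries for duplicates skipped by processFeedBatch
    console.log(`${name}: ${result.success} new, ${result.duplicates} duplicates`);
  }
}

runScrape().catch(console.error);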