// dailytrends/src/__tests__/ScrapingScheduler.test.ts
import { ScrapingScheduler } from '../services/ScrapingScheduler';
import { ContentScrapingService } from '../services/ContentScrapingService';
import { IFeedRepository } from '../repositories/FeedRepository';
import { NewsSource } from '../types/Feed';
// Mock dependencies
jest.mock('../services/ContentScrapingService');
jest.useFakeTimers();
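// Fake timers let the suite drive the scheduler's interval and retry delays
// deterministically instead of waiting on real wall-clock time.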
describe('ScrapingScheduler', () => {
let scrapingScheduler: ScrapingScheduler;
let mockFeedRepository: jest.Mocked<IFeedRepository>;
let mockContentScrapingService: jest.Mocked<ContentScrapingService>;
beforeEach(() => {
jest.clearAllMocks();
jest.clearAllTimers();
mockFeedRepository = {
create: jest.fn(),
findAll: jest.fn(),
findById: jest.fn(),
findByUrl: jest.fn(),
findBySource: jest.fn(),
findTodaysFrontPage: jest.fn(),
update: jest.fn(),
delete: jest.fn(),
deleteMany: jest.fn(),
count: jest.fn(),
exists: jest.fn()
};
mockContentScrapingService = {
scrapeFromMultipleSources: jest.fn(),
scrapeFromWebUrls: jest.fn(),
scrapeFromSource: jest.fn()
} as unknown as jest.Mocked<ContentScrapingService>;
// Mock ContentScrapingService constructor
(ContentScrapingService as jest.MockedClass<typeof ContentScrapingService>)
.mockImplementation(() => mockContentScrapingService);
// Mock static method (reassign it directly; a type assertion is not a valid
// assignment target in TypeScript)
ContentScrapingService.createNewsSourceConfigs = jest.fn().mockReturnValue([
  {
    name: 'El País',
    source: NewsSource.EL_PAIS,
    webUrls: ['https://elpais.com'],
    enabled: true
  },
  {
    name: 'El Mundo',
    source: NewsSource.EL_MUNDO,
    webUrls: ['https://elmundo.es'],
    enabled: true
  }
]);
scrapingScheduler = new ScrapingScheduler(mockFeedRepository, {
intervalMinutes: 1, // 1 minute for testing
maxRetries: 2,
retryDelayMinutes: 1,
enabled: true
});
});
afterEach(() => {
scrapingScheduler.stop();
});
describe('Basic Functionality', () => {
test('should create ScrapingScheduler instance with default config', () => {
const defaultScheduler = new ScrapingScheduler(mockFeedRepository);
const config = defaultScheduler.getConfig();
expect(config).toEqual({
intervalMinutes: 30,
maxRetries: 3,
retryDelayMinutes: 5,
enabled: true
});
});
test('should create ScrapingScheduler instance with custom config', () => {
const customConfig = {
intervalMinutes: 15,
maxRetries: 5,
retryDelayMinutes: 2,
enabled: false
};
const customScheduler = new ScrapingScheduler(mockFeedRepository, customConfig);
const config = customScheduler.getConfig();
expect(config).toEqual(customConfig);
});
test('should initialize with empty stats', () => {
const stats = scrapingScheduler.getStats();
expect(stats).toEqual({
lastRun: null,
nextRun: null,
totalRuns: 0,
successfulRuns: 0,
failedRuns: 0,
totalItemsScraped: 0,
totalDuplicates: 0
});
});
});
describe('Scheduler Control', () => {
test('should start and stop scheduler', () => {
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
scrapingScheduler.start();
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
scrapingScheduler.stop();
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
});
test('should not start if already running', () => {
scrapingScheduler.start();
const firstStart = scrapingScheduler.isSchedulerRunning();
scrapingScheduler.start(); // Try to start again
const secondStart = scrapingScheduler.isSchedulerRunning();
expect(firstStart).toBe(true);
expect(secondStart).toBe(true);
expect(jest.getTimerCount()).toBe(1); // Only one timer should be active
});
test('should not start if disabled', () => {
const disabledScheduler = new ScrapingScheduler(mockFeedRepository, { enabled: false });
disabledScheduler.start();
expect(disabledScheduler.isSchedulerRunning()).toBe(false);
});
});
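describe('Timer-driven Execution (sketch)', () => {
  // Hedged sketch, not part of the original suite: it assumes start() schedules
  // runScrapingCycle with setInterval and that jest.advanceTimersByTimeAsync
  // (Jest >= 29.5) is available to flush the async timer callback.
  test.skip('should trigger a scraping cycle once the interval elapses', async () => {
    mockContentScrapingService.scrapeFromMultipleSources.mockResolvedValue(new Map());
    scrapingScheduler.start();
    await jest.advanceTimersByTimeAsync(60 * 1000); // intervalMinutes: 1
    expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(1);
  });
});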
describe('Scraping Cycle', () => {
test('should run successful scraping cycle', async () => {
const mockResults = new Map([
['El País', { success: 5, failed: 0, duplicates: 2, items: [] }],
['El Mundo', { success: 3, failed: 0, duplicates: 1, items: [] }]
]);
mockContentScrapingService.scrapeFromMultipleSources.mockResolvedValue(mockResults);
await scrapingScheduler.runScrapingCycle();
const stats = scrapingScheduler.getStats();
expect(stats.totalRuns).toBe(1);
expect(stats.successfulRuns).toBe(1);
expect(stats.failedRuns).toBe(0);
expect(stats.totalItemsScraped).toBe(8); // 5 + 3
expect(stats.totalDuplicates).toBe(3); // 2 + 1
expect(stats.lastRun).toBeInstanceOf(Date);
});
test.skip('should handle scraping cycle errors with retries', async () => {
mockContentScrapingService.scrapeFromMultipleSources
.mockRejectedValueOnce(new Error('First attempt failed'))
.mockRejectedValueOnce(new Error('Second attempt failed'))
.mockResolvedValueOnce(new Map([
['El País', { success: 2, failed: 0, duplicates: 1, items: [] }]
]));
await scrapingScheduler.runScrapingCycle();
const stats = scrapingScheduler.getStats();
expect(stats.totalRuns).toBe(1);
expect(stats.successfulRuns).toBe(1);
expect(stats.failedRuns).toBe(0);
expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(3);
});
test.skip('should fail after max retries', async () => {
mockContentScrapingService.scrapeFromMultipleSources
.mockRejectedValue(new Error('Persistent failure'));
await scrapingScheduler.runScrapingCycle();
const stats = scrapingScheduler.getStats();
expect(stats.totalRuns).toBe(1);
expect(stats.successfulRuns).toBe(0);
expect(stats.failedRuns).toBe(1);
expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(3); // 1 + 2 retries
}, 10000);
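// Hedged sketch for re-enabling the retry tests above: if the scheduler waits
// retryDelayMinutes between attempts via setTimeout, the fake clock must be
// advanced while the cycle is still in flight. Assumes Jest >= 29.5 for
// advanceTimersByTimeAsync; the delay mechanism inside ScrapingScheduler is an
// assumption, not something this suite confirms.
test.skip('should retry after the configured delay (sketch)', async () => {
  mockContentScrapingService.scrapeFromMultipleSources
    .mockRejectedValueOnce(new Error('First attempt failed'))
    .mockResolvedValueOnce(new Map());
  const cycle = scrapingScheduler.runScrapingCycle();
  await jest.advanceTimersByTimeAsync(60 * 1000); // retryDelayMinutes: 1
  await cycle;
  expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(2);
});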
test.skip('should not run concurrent cycles', async () => {
let resolveFirst: () => void;
const firstPromise = new Promise<void>(resolve => {
resolveFirst = resolve;
});
mockContentScrapingService.scrapeFromMultipleSources.mockImplementation(() => firstPromise.then(() => new Map()));
// Start first cycle
const firstCycle = scrapingScheduler.runScrapingCycle();
expect(scrapingScheduler.isCycleRunning()).toBe(true);
// Try to start second cycle while first is running
const secondCycle = scrapingScheduler.runScrapingCycle();
// Resolve first cycle
resolveFirst!();
await firstCycle;
await secondCycle;
const stats = scrapingScheduler.getStats();
expect(stats.totalRuns).toBe(1); // Only one cycle should have run
expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(1);
}, 10000);
});
describe('Single Source Scraping', () => {
test('should run single source scraping successfully', async () => {
const mockResult = { success: 3, failed: 0, duplicates: 1, items: [] };
mockContentScrapingService.scrapeFromSource.mockResolvedValue(mockResult);
await scrapingScheduler.runSingleSource('El País');
expect(mockContentScrapingService.scrapeFromSource).toHaveBeenCalledWith({
name: 'El País',
source: NewsSource.EL_PAIS,
webUrls: ['https://elpais.com'],
enabled: true
});
});
test('should handle unknown source name', async () => {
await expect(scrapingScheduler.runSingleSource('Unknown Source'))
.rejects.toThrow('Source configuration not found: Unknown Source');
});
test('should handle single source scraping errors', async () => {
mockContentScrapingService.scrapeFromSource.mockRejectedValue(new Error('Scraping failed'));
await expect(scrapingScheduler.runSingleSource('El País'))
.rejects.toThrow('Scraping failed');
});
});
describe('Configuration Management', () => {
test('should update configuration', () => {
const newConfig = {
intervalMinutes: 60,
maxRetries: 5
};
scrapingScheduler.updateConfig(newConfig);
const config = scrapingScheduler.getConfig();
expect(config.intervalMinutes).toBe(60);
expect(config.maxRetries).toBe(5);
expect(config.retryDelayMinutes).toBe(1); // Should keep existing value
expect(config.enabled).toBe(true); // Should keep existing value
});
test('should restart scheduler when updating config while running', () => {
scrapingScheduler.start();
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
scrapingScheduler.updateConfig({ intervalMinutes: 60 });
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
expect(scrapingScheduler.getConfig().intervalMinutes).toBe(60);
});
test('should not restart scheduler when updating config while stopped', () => {
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
scrapingScheduler.updateConfig({ intervalMinutes: 60 });
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
});
});
describe('Statistics Management', () => {
test('should reset statistics', () => {
// Note: this mutates the snapshot returned by getStats(); whether that touches
// internal state depends on the implementation, so the assertion below only
// checks that resetStats() yields zeroed stats afterwards.
scrapingScheduler.start();
const statsBeforeReset = scrapingScheduler.getStats();
statsBeforeReset.totalRuns = 5;
statsBeforeReset.successfulRuns = 3;
statsBeforeReset.totalItemsScraped = 100;
scrapingScheduler.resetStats();
const statsAfterReset = scrapingScheduler.getStats();
expect(statsAfterReset.totalRuns).toBe(0);
expect(statsAfterReset.successfulRuns).toBe(0);
expect(statsAfterReset.failedRuns).toBe(0);
expect(statsAfterReset.totalItemsScraped).toBe(0);
expect(statsAfterReset.totalDuplicates).toBe(0);
expect(statsAfterReset.lastRun).toBeNull();
});
});
describe('Graceful Shutdown', () => {
test('should shutdown gracefully when not running', async () => {
await expect(scrapingScheduler.shutdown()).resolves.not.toThrow();
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
});
test.skip('should shutdown gracefully when running', async () => {
scrapingScheduler.start();
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
await scrapingScheduler.shutdown();
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
}, 10000);
});
});