ContentScrapingService

albert
2025-07-29 12:45:05 +02:00
parent d35416b5c8
commit 891b1e478d
2 changed files with 415 additions and 0 deletions


@ -0,0 +1,259 @@
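// Unit tests for ContentScrapingService. WebScraper, ScrapingService and the
// logger are all mocked below, so these tests exercise orchestration logic
// only, with no network or database access.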
import { ContentScrapingService } from '../services/ContentScrapingService';
import { WebScraper } from '../utils/WebScraper';
import { ScrapingService } from '../services/ScrapingService';
import { IFeedRepository } from '../repositories/FeedRepository';
import { NewsSource } from '../types/Feed';

// Mock dependencies
jest.mock('../utils/WebScraper');
jest.mock('../services/ScrapingService');
jest.mock('../utils/logger');

describe('ContentScrapingService', () => {
  let contentScrapingService: ContentScrapingService;
  let mockFeedRepository: jest.Mocked<IFeedRepository>;
  let mockWebScraper: jest.Mocked<WebScraper>;
  let mockScrapingService: jest.Mocked<ScrapingService>;

  beforeEach(() => {
    jest.clearAllMocks();

    mockFeedRepository = {
      create: jest.fn(),
      findAll: jest.fn(),
      findById: jest.fn(),
      findByUrl: jest.fn(),
      findBySource: jest.fn(),
      findTodaysFrontPage: jest.fn(),
      update: jest.fn(),
      delete: jest.fn(),
      deleteMany: jest.fn(),
      count: jest.fn(),
      exists: jest.fn()
    };

    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;

    // Make the constructors called inside ContentScrapingService return our mocks
    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);

    contentScrapingService = new ContentScrapingService(mockFeedRepository);
  });
  describe('scrapeFromWebUrls', () => {
    test('should successfully scrape from web URLs', async () => {
      const mockScrapedData = [
        {
          title: 'Web Article 1',
          description: 'Web Description 1',
          url: 'https://example.com/web1',
          publishedAt: new Date()
        },
        {
          title: 'Web Article 2',
          description: 'Web Description 2',
          url: 'https://example.com/web2',
          publishedAt: new Date()
        }
      ];
      const mockFeedData = mockScrapedData.map(data => ({
        ...data,
        source: NewsSource.EL_MUNDO,
        isManual: false
      }));
      const mockResults = [
        { _id: '1', ...mockFeedData[0] },
        { _id: '2', ...mockFeedData[1] }
      ];

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData[0])
        .mockResolvedValueOnce(mockScrapedData[1]);
      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData[0])
        .mockReturnValueOnce(mockFeedData[1]);
      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);

      const urls = ['https://example.com/web1', 'https://example.com/web2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
      expect(result).toEqual({
        success: 2,
        failed: 0,
        duplicates: 0,
        items: mockResults
      });
    });

    test('should handle failed web scraping', async () => {
      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(null)
        .mockRejectedValueOnce(new Error('Scraping failed'));

      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(result).toEqual({
        success: 0,
        failed: 2,
        duplicates: 0,
        items: []
      });
      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
    });
  });
  describe('scrapeFromSource', () => {
    test('should scrape from web URLs', async () => {
      const config = {
        name: 'Test Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: true
      };
      const mockScrapedData = {
        title: 'Web Article',
        description: 'Web Description',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };
      const mockWebFeedData = {
        ...mockScrapedData,
        source: NewsSource.EL_PAIS,
        isManual: false
      };

      // Mock web scraping
      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockWebFeedData }]
      });
    });

    test('should skip disabled sources', async () => {
      const config = {
        name: 'Disabled Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: false
      };

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 0,
        failed: 0,
        duplicates: 0,
        items: []
      });
      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
    });
  });
  describe('scrapeFromMultipleSources', () => {
    test('should scrape from multiple sources', async () => {
      const configs = [
        {
          name: 'Source 1',
          source: NewsSource.EL_PAIS,
          webUrls: ['https://example.com/web1'],
          enabled: true
        },
        {
          name: 'Source 2',
          source: NewsSource.EL_MUNDO,
          webUrls: ['https://example.com/web2'],
          enabled: true
        }
      ];
      const mockScrapedData1 = {
        title: 'Article 1',
        description: 'Description 1',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };
      const mockScrapedData2 = {
        title: 'Article 2',
        description: 'Description 2',
        url: 'https://example.com/web2',
        publishedAt: new Date()
      };
      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData1)
        .mockResolvedValueOnce(mockScrapedData2);
      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData1)
        .mockReturnValueOnce(mockFeedData2);
      mockScrapingService.processFeedBatch
        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);

      const results = await contentScrapingService.scrapeFromMultipleSources(configs);

      expect(results.size).toBe(2);
      expect(results.get('Source 1')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockFeedData1 }]
      });
      expect(results.get('Source 2')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '2', ...mockFeedData2 }]
      });
    });
  });
  describe('createNewsSourceConfigs', () => {
    test('should create default news source configurations', () => {
      const configs = ContentScrapingService.createNewsSourceConfigs();

      expect(configs).toHaveLength(2);
      expect(configs[0]).toEqual({
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      });
      expect(configs[1]).toEqual({
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      });
    });
  });
});


@ -0,0 +1,156 @@
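// Orchestrates web scraping for configured news sources: WebScraper fetches
// and normalizes pages, and ScrapingService persists the results through the
// feed repository.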
import { WebScraper } from '../utils/WebScraper.js';
import { ScrapingService } from './ScrapingService.js';
import { IFeed, NewsSource } from '../types/Feed.js';
import { IFeedRepository } from '../repositories/FeedRepository.js';
import { Logger } from '../utils/logger.js';

// Result of a scraping run. `items` preserves batch order; a null entry marks
// an item that processFeedBatch skipped as a duplicate.
interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
}

interface NewsSourceConfig {
  name: string;
  source: NewsSource;
  webUrls?: string[];
  enabled: boolean;
}
export class ContentScrapingService {
  private webScraper: WebScraper;
  private scrapingService: ScrapingService;

  constructor(feedRepository: IFeedRepository) {
    this.webScraper = new WebScraper();
    this.scrapingService = new ScrapingService(feedRepository);
  }
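
  // Scrapes each URL independently: a URL that returns no data or throws is
  // logged and counted as failed, and the surviving items are persisted in a
  // single batch through ScrapingService.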
  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);
    const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

    for (const url of urls) {
      try {
        const scrapedData = await this.webScraper.scrapeUrl(url);
        if (scrapedData) {
          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
          feedItems.push(feedData);
        }
      } catch (error) {
        Logger.error(`Error scraping URL ${url}:`, error);
      }
    }

    // URLs that yielded nothing (null result or thrown error) count as failed
    const failed = urls.length - feedItems.length;
    if (feedItems.length === 0) {
      Logger.warn('No items scraped from web URLs');
      return { success: 0, failed, duplicates: 0, items: [] };
    }

    const results = await this.scrapingService.processFeedBatch(feedItems);
    return { ...this.analyzeResults(results), failed };
  }
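
  // Runs a full scrape for one configured source. Disabled sources are
  // skipped; web URLs are currently the only supported channel, so a config
  // without webUrls yields an empty result.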
  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
    if (!config.enabled) {
      Logger.info(`Skipping disabled source: ${config.name}`);
      return { success: 0, failed: 0, duplicates: 0, items: [] };
    }

    Logger.info(`Starting content scraping for source: ${config.name}`);
    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };

    // Scrape from web URLs if available
    if (config.webUrls && config.webUrls.length > 0) {
      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
      totalResult = this.mergeResults(totalResult, webResult);
    }

    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
    return totalResult;
  }
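
  // Processes sources sequentially so one slow or failing source does not
  // abort the batch; an unexpected error is recorded as a single failure for
  // that source.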
  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
    Logger.info(`Starting batch scraping from ${configs.length} sources`);
    const results = new Map<string, ScrapingResult>();

    for (const config of configs) {
      try {
        const result = await this.scrapeFromSource(config);
        results.set(config.name, result);
      } catch (error) {
        Logger.error(`Error scraping source ${config.name}:`, error);
        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);
    return results;
  }
  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const duplicates = results.filter(item => item === null).length;
    return {
      success,
      failed: 0, // per-URL scrape failures are counted by the caller; processFeedBatch throws on repository errors rather than failing individual items
      duplicates,
      items: results
    };
  }
  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
    return {
      success: result1.success + result2.success,
      failed: result1.failed + result2.failed,
      duplicates: result1.duplicates + result2.duplicates,
      items: [...result1.items, ...result2.items]
    };
  }

  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems
    };
  }
  // Utility method to create common news source configurations
  static createNewsSourceConfigs(): NewsSourceConfig[] {
    return [
      {
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      },
      {
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      }
    ];
  }
}
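
// A minimal usage sketch, not part of the class above: it assumes some
// concrete IFeedRepository implementation exists elsewhere in the app, and
// the front-page URLs and function name are hypothetical placeholders.
export async function runDefaultScrape(feedRepository: IFeedRepository): Promise<void> {
  // Start from the built-in configs and attach assumed front-page URLs
  const configs = ContentScrapingService.createNewsSourceConfigs().map(config => ({
    ...config,
    webUrls: config.source === NewsSource.EL_PAIS
      ? ['https://elpais.com']
      : ['https://www.elmundo.es']
  }));

  const service = new ContentScrapingService(feedRepository);
  const results = await service.scrapeFromMultipleSources(configs);

  for (const [name, result] of results) {
    Logger.info(`${name}: ${result.success} new, ${result.duplicates} duplicates, ${result.failed} failed`);
  }
}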