ContentScrapingService
src/__tests__/ContentScrapingService.test.ts (new file, 259 lines)
@@ -0,0 +1,259 @@
import { ContentScrapingService } from '../services/ContentScrapingService';
import { WebScraper } from '../utils/WebScraper';
import { ScrapingService } from '../services/ScrapingService';
import { IFeedRepository } from '../repositories/FeedRepository';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

// Mock dependencies
jest.mock('../utils/WebScraper');
jest.mock('../services/ScrapingService');
jest.mock('../utils/logger');

describe('ContentScrapingService', () => {
  let contentScrapingService: ContentScrapingService;
  let mockFeedRepository: jest.Mocked<IFeedRepository>;
  let mockWebScraper: jest.Mocked<WebScraper>;
  let mockScrapingService: jest.Mocked<ScrapingService>;

  beforeEach(() => {
    jest.clearAllMocks();

    mockFeedRepository = {
      create: jest.fn(),
      findAll: jest.fn(),
      findById: jest.fn(),
      findByUrl: jest.fn(),
      findBySource: jest.fn(),
      findTodaysFrontPage: jest.fn(),
      update: jest.fn(),
      delete: jest.fn(),
      deleteMany: jest.fn(),
      count: jest.fn(),
      exists: jest.fn()
    };

    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;

    // Mock constructor calls
    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);

    contentScrapingService = new ContentScrapingService(mockFeedRepository);
  });

  describe('scrapeFromWebUrls', () => {
    test('should successfully scrape from web URLs', async () => {
      const mockScrapedData = [
        {
          title: 'Web Article 1',
          description: 'Web Description 1',
          url: 'https://example.com/web1',
          publishedAt: new Date()
        },
        {
          title: 'Web Article 2',
          description: 'Web Description 2',
          url: 'https://example.com/web2',
          publishedAt: new Date()
        }
      ];

      const mockFeedData = mockScrapedData.map(data => ({
        ...data,
        source: NewsSource.EL_MUNDO,
        isManual: false
      }));

      const mockResults = [
        { _id: '1', ...mockFeedData[0] },
        { _id: '2', ...mockFeedData[1] }
      ];

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData[0])
        .mockResolvedValueOnce(mockScrapedData[1]);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData[0])
        .mockReturnValueOnce(mockFeedData[1]);

      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);

      const urls = ['https://example.com/web1', 'https://example.com/web2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
      expect(result).toEqual({
        success: 2,
        failed: 0,
        duplicates: 0,
        items: mockResults
      });
    });

    test('should handle failed web scraping', async () => {
      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(null)
        .mockRejectedValueOnce(new Error('Scraping failed'));

      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(result).toEqual({
        success: 0,
        failed: 2,
        duplicates: 0,
        items: []
      });
      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromSource', () => {
    test('should scrape from web URLs', async () => {
      const config = {
        name: 'Test Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: true
      };

      const mockScrapedData = {
        title: 'Web Article',
        description: 'Web Description',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockWebFeedData = {
        ...mockScrapedData,
        source: NewsSource.EL_PAIS,
        isManual: false
      };

      // Mock web scraping
      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockWebFeedData }]
      });
    });

    test('should skip disabled sources', async () => {
      const config = {
        name: 'Disabled Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: false
      };

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 0,
        failed: 0,
        duplicates: 0,
        items: []
      });
      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromMultipleSources', () => {
    test('should scrape from multiple sources', async () => {
      const configs = [
        {
          name: 'Source 1',
          source: NewsSource.EL_PAIS,
          webUrls: ['https://example.com/web1'],
          enabled: true
        },
        {
          name: 'Source 2',
          source: NewsSource.EL_MUNDO,
          webUrls: ['https://example.com/web2'],
          enabled: true
        }
      ];

      const mockScrapedData1 = {
        title: 'Article 1',
        description: 'Description 1',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockScrapedData2 = {
        title: 'Article 2',
        description: 'Description 2',
        url: 'https://example.com/web2',
        publishedAt: new Date()
      };

      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData1)
        .mockResolvedValueOnce(mockScrapedData2);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData1)
        .mockReturnValueOnce(mockFeedData2);

      mockScrapingService.processFeedBatch
        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);

      const results = await contentScrapingService.scrapeFromMultipleSources(configs);

      expect(results.size).toBe(2);
      expect(results.get('Source 1')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockFeedData1 }]
      });
      expect(results.get('Source 2')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '2', ...mockFeedData2 }]
      });
    });
  });

  describe('createNewsSourceConfigs', () => {
    test('should create default news source configurations', () => {
      const configs = ContentScrapingService.createNewsSourceConfigs();

      expect(configs).toHaveLength(2);
      expect(configs[0]).toEqual({
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      });
      expect(configs[1]).toEqual({
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      });
    });
  });
});
src/services/ContentScrapingService.ts (new file, 156 lines)
@@ -0,0 +1,156 @@
import { WebScraper } from '../utils/WebScraper.js';
import { ScrapingService } from './ScrapingService.js';
import { IFeed, NewsSource } from '../types/Feed.js';
import { IFeedRepository } from '../repositories/FeedRepository.js';
import { Logger } from '../utils/logger.js';

interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
}

interface NewsSourceConfig {
  name: string;
  source: NewsSource;
  webUrls?: string[];
  enabled: boolean;
}

export class ContentScrapingService {
  private webScraper: WebScraper;
  private scrapingService: ScrapingService;

  constructor(feedRepository: IFeedRepository) {
    this.webScraper = new WebScraper();
    this.scrapingService = new ScrapingService(feedRepository);
  }

  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);

    const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

    for (const url of urls) {
      try {
        const scrapedData = await this.webScraper.scrapeUrl(url);
        if (scrapedData) {
          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
          feedItems.push(feedData);
        }
      } catch (error) {
        Logger.error(`Error scraping URL ${url}:`, error);
      }
    }

    if (feedItems.length === 0) {
      Logger.warn(`No items scraped from web URLs`);
      return { success: 0, failed: urls.length, duplicates: 0, items: [] };
    }

    const results = await this.scrapingService.processFeedBatch(feedItems);
    return this.analyzeResults(results);
  }

  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
    if (!config.enabled) {
      Logger.info(`Skipping disabled source: ${config.name}`);
      return { success: 0, failed: 0, duplicates: 0, items: [] };
    }

    Logger.info(`Starting content scraping for source: ${config.name}`);

    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };

    // Scrape from web URLs if available
    if (config.webUrls && config.webUrls.length > 0) {
      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
      totalResult = this.mergeResults(totalResult, webResult);
    }

    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
    return totalResult;
  }

  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
    Logger.info(`Starting batch scraping from ${configs.length} sources`);

    const results = new Map<string, ScrapingResult>();

    for (const config of configs) {
      try {
        const result = await this.scrapeFromSource(config);
        results.set(config.name, result);
      } catch (error) {
        Logger.error(`Error scraping source ${config.name}:`, error);
        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);

    return results;
  }

  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const duplicates = results.filter(item => item === null).length;

    return {
      success,
      failed: 0, // processFeedBatch doesn't fail individual items, it throws on repository errors
      duplicates,
      items: results
    };
  }

  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
    return {
      success: result1.success + result2.success,
      failed: result1.failed + result2.failed,
      duplicates: result1.duplicates + result2.duplicates,
      items: [...result1.items, ...result2.items]
    };
  }

  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems
    };
  }

  // Utility method to create common news source configurations
  static createNewsSourceConfigs(): NewsSourceConfig[] {
    return [
      {
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      },
      {
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      }
    ];
  }
}
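
A minimal usage sketch, not part of this commit, showing how the service might be wired up once a concrete repository is available. `FeedRepository`, `runDailyScrape`, and the front-page URLs below are assumptions for illustration; the only calls taken from the commit itself are the ContentScrapingService constructor, createNewsSourceConfigs, and scrapeFromMultipleSources.

// Hypothetical entry point; FeedRepository is assumed to implement IFeedRepository.
import { ContentScrapingService } from './services/ContentScrapingService.js';
import { FeedRepository } from './repositories/FeedRepository.js';

async function runDailyScrape(): Promise<void> {
  const service = new ContentScrapingService(new FeedRepository());

  // Start from the default source configs and attach the pages to scrape (placeholder URLs).
  const configs = ContentScrapingService.createNewsSourceConfigs().map(config => ({
    ...config,
    webUrls: ['https://example.com/frontpage']
  }));

  const results = await service.scrapeFromMultipleSources(configs);
  for (const [name, result] of results) {
    console.log(`${name}: ${result.success} saved, ${result.duplicates} duplicates, ${result.failed} failed`);
  }
}

runDailyScrape().catch(error => console.error(error));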