ContentScrapingService
src/__tests__/ContentScrapingService.test.ts (new file, 259 lines)
@@ -0,0 +1,259 @@
import { ContentScrapingService } from '../services/ContentScrapingService';
import { WebScraper } from '../utils/WebScraper';
import { ScrapingService } from '../services/ScrapingService';
import { IFeedRepository } from '../repositories/FeedRepository';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

// Mock dependencies
jest.mock('../utils/WebScraper');
jest.mock('../services/ScrapingService');
jest.mock('../utils/logger');

describe('ContentScrapingService', () => {
  let contentScrapingService: ContentScrapingService;
  let mockFeedRepository: jest.Mocked<IFeedRepository>;
  let mockWebScraper: jest.Mocked<WebScraper>;
  let mockScrapingService: jest.Mocked<ScrapingService>;

  beforeEach(() => {
    jest.clearAllMocks();

    mockFeedRepository = {
      create: jest.fn(),
      findAll: jest.fn(),
      findById: jest.fn(),
      findByUrl: jest.fn(),
      findBySource: jest.fn(),
      findTodaysFrontPage: jest.fn(),
      update: jest.fn(),
      delete: jest.fn(),
      deleteMany: jest.fn(),
      count: jest.fn(),
      exists: jest.fn()
    };

    mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
    mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;

    // Mock constructor calls
    (WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
    (ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);

    contentScrapingService = new ContentScrapingService(mockFeedRepository);
  });

  describe('scrapeFromWebUrls', () => {
    test('should successfully scrape from web URLs', async () => {
      const mockScrapedData = [
        {
          title: 'Web Article 1',
          description: 'Web Description 1',
          url: 'https://example.com/web1',
          publishedAt: new Date()
        },
        {
          title: 'Web Article 2',
          description: 'Web Description 2',
          url: 'https://example.com/web2',
          publishedAt: new Date()
        }
      ];

      const mockFeedData = mockScrapedData.map(data => ({
        ...data,
        source: NewsSource.EL_MUNDO,
        isManual: false
      }));

      const mockResults = [
        { _id: '1', ...mockFeedData[0] },
        { _id: '2', ...mockFeedData[1] }
      ];

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData[0])
        .mockResolvedValueOnce(mockScrapedData[1]);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData[0])
        .mockReturnValueOnce(mockFeedData[1]);

      mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);

      const urls = ['https://example.com/web1', 'https://example.com/web2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
      expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
      expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
      expect(result).toEqual({
        success: 2,
        failed: 0,
        duplicates: 0,
        items: mockResults
      });
    });

    test('should handle failed web scraping', async () => {
      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(null)
        .mockRejectedValueOnce(new Error('Scraping failed'));

      const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
      const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);

      expect(result).toEqual({
        success: 0,
        failed: 2,
        duplicates: 0,
        items: []
      });
      expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromSource', () => {
    test('should scrape from web URLs', async () => {
      const config = {
        name: 'Test Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: true
      };

      const mockScrapedData = {
        title: 'Web Article',
        description: 'Web Description',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockWebFeedData = {
        ...mockScrapedData,
        source: NewsSource.EL_PAIS,
        isManual: false
      };

      // Mock web scraping
      mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
      mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
      mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockWebFeedData }]
      });
    });

    test('should skip disabled sources', async () => {
      const config = {
        name: 'Disabled Source',
        source: NewsSource.EL_PAIS,
        webUrls: ['https://example.com/web1'],
        enabled: false
      };

      const result = await contentScrapingService.scrapeFromSource(config);

      expect(result).toEqual({
        success: 0,
        failed: 0,
        duplicates: 0,
        items: []
      });
      expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
    });
  });

  describe('scrapeFromMultipleSources', () => {
    test('should scrape from multiple sources', async () => {
      const configs = [
        {
          name: 'Source 1',
          source: NewsSource.EL_PAIS,
          webUrls: ['https://example.com/web1'],
          enabled: true
        },
        {
          name: 'Source 2',
          source: NewsSource.EL_MUNDO,
          webUrls: ['https://example.com/web2'],
          enabled: true
        }
      ];

      const mockScrapedData1 = {
        title: 'Article 1',
        description: 'Description 1',
        url: 'https://example.com/web1',
        publishedAt: new Date()
      };

      const mockScrapedData2 = {
        title: 'Article 2',
        description: 'Description 2',
        url: 'https://example.com/web2',
        publishedAt: new Date()
      };

      const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
      const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };

      mockWebScraper.scrapeUrl
        .mockResolvedValueOnce(mockScrapedData1)
        .mockResolvedValueOnce(mockScrapedData2);

      mockWebScraper.convertToFeedData
        .mockReturnValueOnce(mockFeedData1)
        .mockReturnValueOnce(mockFeedData2);

      mockScrapingService.processFeedBatch
        .mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
        .mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);

      const results = await contentScrapingService.scrapeFromMultipleSources(configs);

      expect(results.size).toBe(2);
      expect(results.get('Source 1')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '1', ...mockFeedData1 }]
      });
      expect(results.get('Source 2')).toEqual({
        success: 1,
        failed: 0,
        duplicates: 0,
        items: [{ _id: '2', ...mockFeedData2 }]
      });
    });
  });

  describe('createNewsSourceConfigs', () => {
    test('should create default news source configurations', () => {
      const configs = ContentScrapingService.createNewsSourceConfigs();

      expect(configs).toHaveLength(2);
      expect(configs[0]).toEqual({
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      });
      expect(configs[1]).toEqual({
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      });
    });
  });
});
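The suite above swaps WebScraper and ScrapingService for Jest class mocks, so only ContentScrapingService's orchestration (URL iteration, result counting, and config handling) is exercised against a fully mocked IFeedRepository; assuming a typical ts-jest setup, it can be run with npx jest ContentScrapingService.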

src/services/ContentScrapingService.ts (new file, 156 lines)
@@ -0,0 +1,156 @@
import { WebScraper } from '../utils/WebScraper.js';
import { ScrapingService } from './ScrapingService.js';
import { IFeed, NewsSource } from '../types/Feed.js';
import { IFeedRepository } from '../repositories/FeedRepository.js';
import { Logger } from '../utils/logger.js';

interface ScrapingResult {
  success: number;
  failed: number;
  duplicates: number;
  items: (IFeed | null)[];
}

interface NewsSourceConfig {
  name: string;
  source: NewsSource;
  webUrls?: string[];
  enabled: boolean;
}

export class ContentScrapingService {
  private webScraper: WebScraper;
  private scrapingService: ScrapingService;

  constructor(feedRepository: IFeedRepository) {
    this.webScraper = new WebScraper();
    this.scrapingService = new ScrapingService(feedRepository);
  }

  async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
    Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);

    const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];

    for (const url of urls) {
      try {
        const scrapedData = await this.webScraper.scrapeUrl(url);
        if (scrapedData) {
          const feedData = this.webScraper.convertToFeedData(scrapedData, source);
          feedItems.push(feedData);
        }
      } catch (error) {
        Logger.error(`Error scraping URL ${url}:`, error);
      }
    }

    if (feedItems.length === 0) {
      Logger.warn(`No items scraped from web URLs`);
      return { success: 0, failed: urls.length, duplicates: 0, items: [] };
    }

    const results = await this.scrapingService.processFeedBatch(feedItems);
    return this.analyzeResults(results);
  }

  async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
    if (!config.enabled) {
      Logger.info(`Skipping disabled source: ${config.name}`);
      return { success: 0, failed: 0, duplicates: 0, items: [] };
    }

    Logger.info(`Starting content scraping for source: ${config.name}`);

    let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };

    // Scrape from web URLs if available
    if (config.webUrls && config.webUrls.length > 0) {
      const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
      totalResult = this.mergeResults(totalResult, webResult);
    }

    Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
    return totalResult;
  }

  async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
    Logger.info(`Starting batch scraping from ${configs.length} sources`);

    const results = new Map<string, ScrapingResult>();

    for (const config of configs) {
      try {
        const result = await this.scrapeFromSource(config);
        results.set(config.name, result);
      } catch (error) {
        Logger.error(`Error scraping source ${config.name}:`, error);
        results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
      }
    }

    const totalStats = this.calculateTotalStats(results);
    Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);

    return results;
  }

  private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
    const success = results.filter(item => item !== null).length;
    const duplicates = results.filter(item => item === null).length;

    return {
      success,
      failed: 0, // processFeedBatch doesn't fail individual items, it throws on repository errors
      duplicates,
      items: results
    };
  }

  private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
    return {
      success: result1.success + result2.success,
      failed: result1.failed + result2.failed,
      duplicates: result1.duplicates + result2.duplicates,
      items: [...result1.items, ...result2.items]
    };
  }

  private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
    let totalSuccess = 0;
    let totalFailed = 0;
    let totalDuplicates = 0;
    const allItems: (IFeed | null)[] = [];

    for (const result of results.values()) {
      totalSuccess += result.success;
      totalFailed += result.failed;
      totalDuplicates += result.duplicates;
      allItems.push(...result.items);
    }

    return {
      success: totalSuccess,
      failed: totalFailed,
      duplicates: totalDuplicates,
      items: allItems
    };
  }

  // Utility method to create common news source configurations
  static createNewsSourceConfigs(): NewsSourceConfig[] {
    return [
      {
        name: 'El País',
        source: NewsSource.EL_PAIS,
        enabled: true
      },
      {
        name: 'El Mundo',
        source: NewsSource.EL_MUNDO,
        enabled: true
      }
    ];
  }
}
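
For context, a minimal usage sketch of the new service. Only the ContentScrapingService calls are taken from the file above; the FeedRepository import path and concrete class, the file location, and the placeholder URLs are assumptions for illustration, since the commit only references the IFeedRepository interface.

// Hypothetical caller, e.g. src/jobs/dailyScrape.ts (not part of this commit).
import { ContentScrapingService } from '../services/ContentScrapingService.js';
import { FeedRepository } from '../repositories/FeedRepository.js'; // assumed concrete IFeedRepository implementation

async function runDailyScrape(): Promise<void> {
  const service = new ContentScrapingService(new FeedRepository());

  // createNewsSourceConfigs() returns configs without webUrls, and
  // scrapeFromSource() only scrapes when webUrls is non-empty, so the
  // URLs must be filled in first (placeholders shown here).
  const configs = ContentScrapingService.createNewsSourceConfigs().map(config => ({
    ...config,
    webUrls: ['https://example.com/frontpage']
  }));

  const results = await service.scrapeFromMultipleSources(configs);
  for (const [name, result] of results) {
    console.log(`${name}: ${result.success} saved, ${result.duplicates} duplicates, ${result.failed} failed`);
  }
}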