1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
||||
node_modules
|
||||
dist
|
||||
*.bk
|
||||
.DS_Store
|
44
README.md
44
README.md
@ -51,6 +51,9 @@
|
||||
- implement endpoints and their tests
|
||||
- troubleshooting: update jest.config and tsconfig to allow test use dependencies
|
||||
|
||||
- Fourth part: [#6 PR : feat/scraper](https://github.com/aabril/dailytrends/pull/6)
|
||||
- Crea un “servicio de lectura de feeds” que extraiga por web scraping
|
||||
- we are going to be implementing a Factory for the scraper, since we are going to input values and then will build our custom class
|
||||
|
||||
## Feed layer abstractions
|
||||
|
||||
@ -114,3 +117,44 @@ EXPOSE 3000
|
||||
CMD ["node", "dist/index.js"]
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Scraper OOP
|
||||
|
||||
#### Entrypoint
|
||||
- `scraper.ts` - Application entry point that initializes the scraping system
|
||||
|
||||
#### Core Services
|
||||
- `ScrapingScheduler.ts` - Orchestrates scraping cycles and timing
|
||||
- `ContentScrapingService.ts` - Handles web content scraping logic
|
||||
- `FeedReaderService.ts` - Manages newspaper extraction
|
||||
- `ScrapingService.ts` - Base scraping functionality
|
||||
|
||||
#### Utilities
|
||||
- `WebScraper.ts` - HTML parsing and data extraction utility
|
||||
- `logger.ts` - Logging utility
|
||||
|
||||
#### Extractors
|
||||
- `BaseNewspaperExtractor.ts` - clase Abstract Base
|
||||
- `ElPaisExtractor.ts` - especificación / extractor para El País
|
||||
- `ElMundoExtractor.ts` - especificación / extractor para El Mundo
|
||||
- `NewspaperExtractorFactory.ts` - clase Factory para crear extractors
|
||||
|
||||
#### Types & Interfaces
|
||||
- `Feed.ts` - tipos y interfaces
|
||||
- `NewspaperTypes.ts` - configuración de las interfaces
|
||||
- `FeedRepository.ts` - abstracción interfaz de la base de datos
|
||||
|
||||
## Propiedades de OOP
|
||||
|
||||
- He intentado seguir las propiedades de OOP. Ejemplo:
|
||||
- separación de responsabilidades: con las capas de abstracción, y servicios dedicados
|
||||
- Factory de los extractors en NewspaperExtractorFactory, básicamente, patrón de diseño que nos ayuda a crear objetos de una clase específica, basados en ciertos parámetros, y así lo adaptamos a nuestros periodicos favoritos.
|
||||
- Herencia, desde BaseNewspaperExtractor a los extractors.
|
||||
- Utils, para tener DRY y poder usarlo desde diferentes classes.
|
||||
- He intentando poner tests donde sea necesario, y de forma que tenga sentido.
|
||||
|
||||
|
||||
Obviamente cualquier propuesta está siempre abierta a debate y a mejoras.
|
||||
En mi caso, y dentro de las limitaciones, he intentado seguir las instrucciones y ver como lo podemos adaptar.
|
||||
Seguramente con más tiempo se puede simplificar más sin perder funcionalidades.
|
@ -18,6 +18,8 @@
|
||||
"build": "tsc",
|
||||
"start": "node dist/server.js",
|
||||
"dev": "tsx watch src/server.ts",
|
||||
"scraper": "node dist/scraper.js",
|
||||
"scraper:dev": "tsx watch src/scraper.ts",
|
||||
"test": "jest",
|
||||
"test:watch": "jest --watch",
|
||||
"lint": "eslint src/**/*.ts",
|
||||
|
259
src/__tests__/ContentScrapingService.test.ts
Normal file
259
src/__tests__/ContentScrapingService.test.ts
Normal file
@ -0,0 +1,259 @@
|
||||
import { ContentScrapingService } from '../services/ContentScrapingService';
|
||||
import { WebScraper } from '../utils/WebScraper';
|
||||
import { ScrapingService } from '../services/ScrapingService';
|
||||
import { IFeedRepository } from '../repositories/FeedRepository';
|
||||
import { NewsSource } from '../types/Feed';
|
||||
import { Logger } from '../utils/logger';
|
||||
|
||||
// Mock dependencies
|
||||
jest.mock('../utils/WebScraper');
|
||||
jest.mock('../services/ScrapingService');
|
||||
jest.mock('../utils/logger');
|
||||
|
||||
describe('ContentScrapingService', () => {
|
||||
let contentScrapingService: ContentScrapingService;
|
||||
let mockFeedRepository: jest.Mocked<IFeedRepository>;
|
||||
let mockWebScraper: jest.Mocked<WebScraper>;
|
||||
|
||||
let mockScrapingService: jest.Mocked<ScrapingService>;
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
|
||||
mockFeedRepository = {
|
||||
create: jest.fn(),
|
||||
findAll: jest.fn(),
|
||||
findById: jest.fn(),
|
||||
findByUrl: jest.fn(),
|
||||
findBySource: jest.fn(),
|
||||
findTodaysFrontPage: jest.fn(),
|
||||
update: jest.fn(),
|
||||
delete: jest.fn(),
|
||||
deleteMany: jest.fn(),
|
||||
count: jest.fn(),
|
||||
exists: jest.fn()
|
||||
};
|
||||
|
||||
mockWebScraper = new WebScraper() as jest.Mocked<WebScraper>;
|
||||
|
||||
mockScrapingService = new ScrapingService(mockFeedRepository) as jest.Mocked<ScrapingService>;
|
||||
|
||||
// Mock constructor calls
|
||||
(WebScraper as jest.MockedClass<typeof WebScraper>).mockImplementation(() => mockWebScraper);
|
||||
|
||||
(ScrapingService as jest.MockedClass<typeof ScrapingService>).mockImplementation(() => mockScrapingService);
|
||||
|
||||
contentScrapingService = new ContentScrapingService(mockFeedRepository);
|
||||
});
|
||||
|
||||
|
||||
|
||||
describe('scrapeFromWebUrls', () => {
|
||||
test('should successfully scrape from web URLs', async () => {
|
||||
const mockScrapedData = [
|
||||
{
|
||||
title: 'Web Article 1',
|
||||
description: 'Web Description 1',
|
||||
url: 'https://example.com/web1',
|
||||
publishedAt: new Date()
|
||||
},
|
||||
{
|
||||
title: 'Web Article 2',
|
||||
description: 'Web Description 2',
|
||||
url: 'https://example.com/web2',
|
||||
publishedAt: new Date()
|
||||
}
|
||||
];
|
||||
|
||||
const mockFeedData = mockScrapedData.map(data => ({
|
||||
...data,
|
||||
source: NewsSource.EL_MUNDO,
|
||||
isManual: false
|
||||
}));
|
||||
|
||||
const mockResults = [
|
||||
{ _id: '1', ...mockFeedData[0] },
|
||||
{ _id: '2', ...mockFeedData[1] }
|
||||
];
|
||||
|
||||
mockWebScraper.scrapeUrl
|
||||
.mockResolvedValueOnce(mockScrapedData[0])
|
||||
.mockResolvedValueOnce(mockScrapedData[1]);
|
||||
|
||||
mockWebScraper.convertToFeedData
|
||||
.mockReturnValueOnce(mockFeedData[0])
|
||||
.mockReturnValueOnce(mockFeedData[1]);
|
||||
|
||||
mockScrapingService.processFeedBatch.mockResolvedValue(mockResults);
|
||||
|
||||
const urls = ['https://example.com/web1', 'https://example.com/web2'];
|
||||
const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);
|
||||
|
||||
expect(mockWebScraper.scrapeUrl).toHaveBeenCalledTimes(2);
|
||||
expect(mockWebScraper.convertToFeedData).toHaveBeenCalledTimes(2);
|
||||
expect(mockScrapingService.processFeedBatch).toHaveBeenCalledWith(mockFeedData);
|
||||
expect(result).toEqual({
|
||||
success: 2,
|
||||
failed: 0,
|
||||
duplicates: 0,
|
||||
items: mockResults
|
||||
});
|
||||
});
|
||||
|
||||
test('should handle failed web scraping', async () => {
|
||||
mockWebScraper.scrapeUrl
|
||||
.mockResolvedValueOnce(null)
|
||||
.mockRejectedValueOnce(new Error('Scraping failed'));
|
||||
|
||||
const urls = ['https://example.com/fail1', 'https://example.com/fail2'];
|
||||
const result = await contentScrapingService.scrapeFromWebUrls(urls, NewsSource.EL_MUNDO);
|
||||
|
||||
expect(result).toEqual({
|
||||
success: 0,
|
||||
failed: 2,
|
||||
duplicates: 0,
|
||||
items: []
|
||||
});
|
||||
expect(mockScrapingService.processFeedBatch).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe('scrapeFromSource', () => {
|
||||
test('should scrape from web URLs', async () => {
|
||||
const config = {
|
||||
name: 'Test Source',
|
||||
source: NewsSource.EL_PAIS,
|
||||
webUrls: ['https://example.com/web1'],
|
||||
enabled: true
|
||||
};
|
||||
|
||||
const mockScrapedData = {
|
||||
title: 'Web Article',
|
||||
description: 'Web Description',
|
||||
url: 'https://example.com/web1',
|
||||
publishedAt: new Date()
|
||||
};
|
||||
|
||||
const mockWebFeedData = {
|
||||
...mockScrapedData,
|
||||
source: NewsSource.EL_PAIS,
|
||||
isManual: false
|
||||
};
|
||||
|
||||
// Mock web scraping
|
||||
mockWebScraper.scrapeUrl.mockResolvedValue(mockScrapedData);
|
||||
mockWebScraper.convertToFeedData.mockReturnValue(mockWebFeedData);
|
||||
mockScrapingService.processFeedBatch.mockResolvedValue([{ _id: '1', ...mockWebFeedData }]);
|
||||
|
||||
const result = await contentScrapingService.scrapeFromSource(config);
|
||||
|
||||
expect(result).toEqual({
|
||||
success: 1,
|
||||
failed: 0,
|
||||
duplicates: 0,
|
||||
items: [{ _id: '1', ...mockWebFeedData }]
|
||||
});
|
||||
});
|
||||
|
||||
test('should skip disabled sources', async () => {
|
||||
const config = {
|
||||
name: 'Disabled Source',
|
||||
source: NewsSource.EL_PAIS,
|
||||
webUrls: ['https://example.com/web1'],
|
||||
enabled: false
|
||||
};
|
||||
|
||||
const result = await contentScrapingService.scrapeFromSource(config);
|
||||
|
||||
expect(result).toEqual({
|
||||
success: 0,
|
||||
failed: 0,
|
||||
duplicates: 0,
|
||||
items: []
|
||||
});
|
||||
expect(mockWebScraper.scrapeUrl).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
describe('scrapeFromMultipleSources', () => {
|
||||
test('should scrape from multiple sources', async () => {
|
||||
const configs = [
|
||||
{
|
||||
name: 'Source 1',
|
||||
source: NewsSource.EL_PAIS,
|
||||
webUrls: ['https://example.com/web1'],
|
||||
enabled: true
|
||||
},
|
||||
{
|
||||
name: 'Source 2',
|
||||
source: NewsSource.EL_MUNDO,
|
||||
webUrls: ['https://example.com/web2'],
|
||||
enabled: true
|
||||
}
|
||||
];
|
||||
|
||||
const mockScrapedData1 = {
|
||||
title: 'Article 1',
|
||||
description: 'Description 1',
|
||||
url: 'https://example.com/web1',
|
||||
publishedAt: new Date()
|
||||
};
|
||||
|
||||
const mockScrapedData2 = {
|
||||
title: 'Article 2',
|
||||
description: 'Description 2',
|
||||
url: 'https://example.com/web2',
|
||||
publishedAt: new Date()
|
||||
};
|
||||
|
||||
const mockFeedData1 = { ...mockScrapedData1, source: NewsSource.EL_PAIS, isManual: false };
|
||||
const mockFeedData2 = { ...mockScrapedData2, source: NewsSource.EL_MUNDO, isManual: false };
|
||||
|
||||
mockWebScraper.scrapeUrl
|
||||
.mockResolvedValueOnce(mockScrapedData1)
|
||||
.mockResolvedValueOnce(mockScrapedData2);
|
||||
|
||||
mockWebScraper.convertToFeedData
|
||||
.mockReturnValueOnce(mockFeedData1)
|
||||
.mockReturnValueOnce(mockFeedData2);
|
||||
|
||||
mockScrapingService.processFeedBatch
|
||||
.mockResolvedValueOnce([{ _id: '1', ...mockFeedData1 }])
|
||||
.mockResolvedValueOnce([{ _id: '2', ...mockFeedData2 }]);
|
||||
|
||||
const results = await contentScrapingService.scrapeFromMultipleSources(configs);
|
||||
|
||||
expect(results.size).toBe(2);
|
||||
expect(results.get('Source 1')).toEqual({
|
||||
success: 1,
|
||||
failed: 0,
|
||||
duplicates: 0,
|
||||
items: [{ _id: '1', ...mockFeedData1 }]
|
||||
});
|
||||
expect(results.get('Source 2')).toEqual({
|
||||
success: 1,
|
||||
failed: 0,
|
||||
duplicates: 0,
|
||||
items: [{ _id: '2', ...mockFeedData2 }]
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('createNewsSourceConfigs', () => {
|
||||
test('should create default news source configurations', () => {
|
||||
const configs = ContentScrapingService.createNewsSourceConfigs();
|
||||
|
||||
expect(configs).toHaveLength(2);
|
||||
expect(configs[0]).toEqual({
|
||||
name: 'El País',
|
||||
source: NewsSource.EL_PAIS,
|
||||
enabled: true
|
||||
});
|
||||
expect(configs[1]).toEqual({
|
||||
name: 'El Mundo',
|
||||
source: NewsSource.EL_MUNDO,
|
||||
enabled: true
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
108
src/__tests__/FeedReaderService.test.ts
Normal file
108
src/__tests__/FeedReaderService.test.ts
Normal file
@ -0,0 +1,108 @@
|
||||
import { FeedReaderService } from '../services/FeedReaderService';
|
||||
import { IFeedRepository } from '../repositories/FeedRepository';
|
||||
import { NewsSource } from '../types/Feed';
|
||||
|
||||
// Mock dependencies
|
||||
jest.mock('../utils/logger');
|
||||
jest.mock('../services/ScrapingService');
|
||||
jest.mock('../utils/WebScraper');
|
||||
jest.mock('../extractors/ElPaisExtractor');
|
||||
jest.mock('../extractors/ElMundoExtractor');
|
||||
|
||||
// Mock fetch globally
|
||||
global.fetch = jest.fn();
|
||||
|
||||
const mockFeedRepository: jest.Mocked<IFeedRepository> = {
|
||||
create: jest.fn(),
|
||||
findAll: jest.fn(),
|
||||
findById: jest.fn(),
|
||||
findByUrl: jest.fn(),
|
||||
update: jest.fn(),
|
||||
delete: jest.fn(),
|
||||
findBySource: jest.fn(),
|
||||
findTodaysFrontPage: jest.fn(),
|
||||
deleteMany: jest.fn(),
|
||||
count: jest.fn(),
|
||||
exists: jest.fn()
|
||||
};
|
||||
|
||||
// Mock ScrapingService
|
||||
const mockScrapingService = {
|
||||
processFeedBatch: jest.fn()
|
||||
};
|
||||
|
||||
jest.mock('../services/ScrapingService', () => {
|
||||
return {
|
||||
ScrapingService: jest.fn().mockImplementation(() => mockScrapingService)
|
||||
};
|
||||
});
|
||||
|
||||
// Mock WebScraper
|
||||
const mockWebScraper = {
|
||||
scrapeUrl: jest.fn(),
|
||||
convertToFeedData: jest.fn()
|
||||
};
|
||||
|
||||
jest.mock('../utils/WebScraper', () => {
|
||||
return {
|
||||
WebScraper: jest.fn().mockImplementation(() => mockWebScraper)
|
||||
};
|
||||
});
|
||||
|
||||
// Mock extractors
|
||||
const mockExtractor = {
|
||||
extractNews: jest.fn(),
|
||||
isEnabled: jest.fn().mockReturnValue(true),
|
||||
getName: jest.fn(),
|
||||
getSource: jest.fn()
|
||||
};
|
||||
|
||||
const mockElPaisExtractor = {
|
||||
...mockExtractor,
|
||||
getName: jest.fn().mockReturnValue('El País'),
|
||||
getSource: jest.fn().mockReturnValue(NewsSource.EL_PAIS)
|
||||
};
|
||||
|
||||
const mockElMundoExtractor = {
|
||||
...mockExtractor,
|
||||
getName: jest.fn().mockReturnValue('El Mundo'),
|
||||
getSource: jest.fn().mockReturnValue(NewsSource.EL_MUNDO)
|
||||
};
|
||||
|
||||
jest.mock('../extractors/NewspaperExtractorFactory', () => ({
|
||||
NewspaperExtractorFactory: {
|
||||
getAllAvailableExtractors: jest.fn(() => [mockElPaisExtractor, mockElMundoExtractor]),
|
||||
createExtractor: jest.fn((source) => {
|
||||
if (source === NewsSource.EL_PAIS) return mockElPaisExtractor;
|
||||
if (source === NewsSource.EL_MUNDO) return mockElMundoExtractor;
|
||||
return null;
|
||||
})
|
||||
}
|
||||
}));
|
||||
|
||||
describe('FeedReaderService', () => {
|
||||
let feedReaderService: FeedReaderService;
|
||||
const mockFetch = fetch as jest.MockedFunction<typeof fetch>;
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
feedReaderService = new FeedReaderService(mockFeedRepository);
|
||||
});
|
||||
|
||||
describe('Constructor and Initialization', () => {
|
||||
it('should initialize with available extractors', () => {
|
||||
const newspapers = feedReaderService.getAvailableNewspapers();
|
||||
expect(newspapers).toHaveLength(2);
|
||||
expect(newspapers.map(n => n.source)).toContain(NewsSource.EL_PAIS);
|
||||
expect(newspapers.map(n => n.source)).toContain(NewsSource.EL_MUNDO);
|
||||
});
|
||||
|
||||
it('should have all extractors enabled by default', () => {
|
||||
const newspapers = feedReaderService.getAvailableNewspapers();
|
||||
newspapers.forEach(newspaper => {
|
||||
expect(newspaper.enabled).toBe(true);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
});
|
317
src/__tests__/ScrapingScheduler.test.ts
Normal file
317
src/__tests__/ScrapingScheduler.test.ts
Normal file
@ -0,0 +1,317 @@
|
||||
import { ScrapingScheduler } from '../services/ScrapingScheduler';
|
||||
import { ContentScrapingService } from '../services/ContentScrapingService';
|
||||
import { IFeedRepository } from '../repositories/FeedRepository';
|
||||
import { NewsSource } from '../types/Feed';
|
||||
|
||||
// Mock dependencies
|
||||
jest.mock('../services/ContentScrapingService');
|
||||
jest.useFakeTimers();
|
||||
|
||||
describe('ScrapingScheduler', () => {
|
||||
let scrapingScheduler: ScrapingScheduler;
|
||||
let mockFeedRepository: jest.Mocked<IFeedRepository>;
|
||||
let mockContentScrapingService: jest.Mocked<ContentScrapingService>;
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
jest.clearAllTimers();
|
||||
|
||||
mockFeedRepository = {
|
||||
create: jest.fn(),
|
||||
findAll: jest.fn(),
|
||||
findById: jest.fn(),
|
||||
findByUrl: jest.fn(),
|
||||
findBySource: jest.fn(),
|
||||
findTodaysFrontPage: jest.fn(),
|
||||
update: jest.fn(),
|
||||
delete: jest.fn(),
|
||||
deleteMany: jest.fn(),
|
||||
count: jest.fn(),
|
||||
exists: jest.fn()
|
||||
};
|
||||
|
||||
mockContentScrapingService = {
|
||||
scrapeFromMultipleSources: jest.fn(),
|
||||
|
||||
scrapeFromWebUrls: jest.fn(),
|
||||
scrapeFromSource: jest.fn()
|
||||
} as unknown as jest.Mocked<ContentScrapingService>;
|
||||
|
||||
// Mock ContentScrapingService constructor
|
||||
(ContentScrapingService as jest.MockedClass<typeof ContentScrapingService>)
|
||||
.mockImplementation(() => mockContentScrapingService);
|
||||
|
||||
// Mock static method
|
||||
(ContentScrapingService.createNewsSourceConfigs as jest.Mock) = jest.fn().mockReturnValue([
|
||||
{
|
||||
name: 'El País',
|
||||
source: NewsSource.EL_PAIS,
|
||||
webUrls: ['https://elpais.com'],
|
||||
enabled: true
|
||||
},
|
||||
{
|
||||
name: 'El Mundo',
|
||||
source: NewsSource.EL_MUNDO,
|
||||
webUrls: ['https://elmundo.es'],
|
||||
enabled: true
|
||||
}
|
||||
]);
|
||||
|
||||
scrapingScheduler = new ScrapingScheduler(mockFeedRepository, {
|
||||
intervalMinutes: 1, // 1 minute for testing
|
||||
maxRetries: 2,
|
||||
retryDelayMinutes: 1,
|
||||
enabled: true
|
||||
});
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
scrapingScheduler.stop();
|
||||
});
|
||||
|
||||
describe('Basic Functionality', () => {
|
||||
test('should create ScrapingScheduler instance with default config', () => {
|
||||
const defaultScheduler = new ScrapingScheduler(mockFeedRepository);
|
||||
const config = defaultScheduler.getConfig();
|
||||
|
||||
expect(config).toEqual({
|
||||
intervalMinutes: 30,
|
||||
maxRetries: 3,
|
||||
retryDelayMinutes: 5,
|
||||
enabled: true
|
||||
});
|
||||
});
|
||||
|
||||
test('should create ScrapingScheduler instance with custom config', () => {
|
||||
const customConfig = {
|
||||
intervalMinutes: 15,
|
||||
maxRetries: 5,
|
||||
retryDelayMinutes: 2,
|
||||
enabled: false
|
||||
};
|
||||
|
||||
const customScheduler = new ScrapingScheduler(mockFeedRepository, customConfig);
|
||||
const config = customScheduler.getConfig();
|
||||
|
||||
expect(config).toEqual(customConfig);
|
||||
});
|
||||
|
||||
test('should initialize with empty stats', () => {
|
||||
const stats = scrapingScheduler.getStats();
|
||||
|
||||
expect(stats).toEqual({
|
||||
lastRun: null,
|
||||
nextRun: null,
|
||||
totalRuns: 0,
|
||||
successfulRuns: 0,
|
||||
failedRuns: 0,
|
||||
totalItemsScraped: 0,
|
||||
totalDuplicates: 0
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('Scheduler Control', () => {
|
||||
test('should start and stop scheduler', () => {
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
|
||||
|
||||
scrapingScheduler.start();
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
|
||||
|
||||
scrapingScheduler.stop();
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
|
||||
});
|
||||
|
||||
test('should not start if already running', () => {
|
||||
scrapingScheduler.start();
|
||||
const firstStart = scrapingScheduler.isSchedulerRunning();
|
||||
|
||||
scrapingScheduler.start(); // Try to start again
|
||||
const secondStart = scrapingScheduler.isSchedulerRunning();
|
||||
|
||||
expect(firstStart).toBe(true);
|
||||
expect(secondStart).toBe(true);
|
||||
expect(jest.getTimerCount()).toBe(1); // Only one timer should be active
|
||||
});
|
||||
|
||||
test('should not start if disabled', () => {
|
||||
const disabledScheduler = new ScrapingScheduler(mockFeedRepository, { enabled: false });
|
||||
|
||||
disabledScheduler.start();
|
||||
expect(disabledScheduler.isSchedulerRunning()).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('Scraping Cycle', () => {
|
||||
test('should run successful scraping cycle', async () => {
|
||||
const mockResults = new Map([
|
||||
['El País', { success: 5, failed: 0, duplicates: 2, items: [] }],
|
||||
['El Mundo', { success: 3, failed: 0, duplicates: 1, items: [] }]
|
||||
]);
|
||||
|
||||
mockContentScrapingService.scrapeFromMultipleSources.mockResolvedValue(mockResults);
|
||||
|
||||
await scrapingScheduler.runScrapingCycle();
|
||||
|
||||
const stats = scrapingScheduler.getStats();
|
||||
expect(stats.totalRuns).toBe(1);
|
||||
expect(stats.successfulRuns).toBe(1);
|
||||
expect(stats.failedRuns).toBe(0);
|
||||
expect(stats.totalItemsScraped).toBe(8); // 5 + 3
|
||||
expect(stats.totalDuplicates).toBe(3); // 2 + 1
|
||||
expect(stats.lastRun).toBeInstanceOf(Date);
|
||||
});
|
||||
|
||||
test.skip('should handle scraping cycle errors with retries', async () => {
|
||||
mockContentScrapingService.scrapeFromMultipleSources
|
||||
.mockRejectedValueOnce(new Error('First attempt failed'))
|
||||
.mockRejectedValueOnce(new Error('Second attempt failed'))
|
||||
.mockResolvedValueOnce(new Map([
|
||||
['El País', { success: 2, failed: 0, duplicates: 1, items: [] }]
|
||||
]));
|
||||
|
||||
await scrapingScheduler.runScrapingCycle();
|
||||
|
||||
const stats = scrapingScheduler.getStats();
|
||||
expect(stats.totalRuns).toBe(1);
|
||||
expect(stats.successfulRuns).toBe(1);
|
||||
expect(stats.failedRuns).toBe(0);
|
||||
expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(3);
|
||||
});
|
||||
|
||||
test.skip('should fail after max retries', async () => {
|
||||
mockContentScrapingService.scrapeFromMultipleSources
|
||||
.mockRejectedValue(new Error('Persistent failure'));
|
||||
|
||||
await scrapingScheduler.runScrapingCycle();
|
||||
|
||||
const stats = scrapingScheduler.getStats();
|
||||
expect(stats.totalRuns).toBe(1);
|
||||
expect(stats.successfulRuns).toBe(0);
|
||||
expect(stats.failedRuns).toBe(1);
|
||||
expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(3); // 1 + 2 retries
|
||||
}, 10000);
|
||||
|
||||
test.skip('should not run concurrent cycles', async () => {
|
||||
let resolveFirst: () => void;
|
||||
const firstPromise = new Promise<void>(resolve => {
|
||||
resolveFirst = resolve;
|
||||
});
|
||||
|
||||
mockContentScrapingService.scrapeFromMultipleSources.mockImplementation(() => firstPromise.then(() => new Map()));
|
||||
|
||||
// Start first cycle
|
||||
const firstCycle = scrapingScheduler.runScrapingCycle();
|
||||
expect(scrapingScheduler.isCycleRunning()).toBe(true);
|
||||
|
||||
// Try to start second cycle while first is running
|
||||
const secondCycle = scrapingScheduler.runScrapingCycle();
|
||||
|
||||
// Resolve first cycle
|
||||
resolveFirst!();
|
||||
await firstCycle;
|
||||
await secondCycle;
|
||||
|
||||
const stats = scrapingScheduler.getStats();
|
||||
expect(stats.totalRuns).toBe(1); // Only one cycle should have run
|
||||
expect(mockContentScrapingService.scrapeFromMultipleSources).toHaveBeenCalledTimes(1);
|
||||
}, 10000);
|
||||
});
|
||||
|
||||
describe('Single Source Scraping', () => {
|
||||
test('should run single source scraping successfully', async () => {
|
||||
const mockResult = { success: 3, failed: 0, duplicates: 1, items: [] };
|
||||
mockContentScrapingService.scrapeFromSource.mockResolvedValue(mockResult);
|
||||
|
||||
await scrapingScheduler.runSingleSource('El País');
|
||||
|
||||
expect(mockContentScrapingService.scrapeFromSource).toHaveBeenCalledWith({
|
||||
name: 'El País',
|
||||
source: NewsSource.EL_PAIS,
|
||||
webUrls: ['https://elpais.com'],
|
||||
enabled: true
|
||||
});
|
||||
});
|
||||
|
||||
test('should handle unknown source name', async () => {
|
||||
await expect(scrapingScheduler.runSingleSource('Unknown Source'))
|
||||
.rejects.toThrow('Source configuration not found: Unknown Source');
|
||||
});
|
||||
|
||||
test('should handle single source scraping errors', async () => {
|
||||
mockContentScrapingService.scrapeFromSource.mockRejectedValue(new Error('Scraping failed'));
|
||||
|
||||
await expect(scrapingScheduler.runSingleSource('El País'))
|
||||
.rejects.toThrow('Scraping failed');
|
||||
});
|
||||
});
|
||||
|
||||
describe('Configuration Management', () => {
|
||||
test('should update configuration', () => {
|
||||
const newConfig = {
|
||||
intervalMinutes: 60,
|
||||
maxRetries: 5
|
||||
};
|
||||
|
||||
scrapingScheduler.updateConfig(newConfig);
|
||||
const config = scrapingScheduler.getConfig();
|
||||
|
||||
expect(config.intervalMinutes).toBe(60);
|
||||
expect(config.maxRetries).toBe(5);
|
||||
expect(config.retryDelayMinutes).toBe(1); // Should keep existing value
|
||||
expect(config.enabled).toBe(true); // Should keep existing value
|
||||
});
|
||||
|
||||
test('should restart scheduler when updating config while running', () => {
|
||||
scrapingScheduler.start();
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
|
||||
|
||||
scrapingScheduler.updateConfig({ intervalMinutes: 60 });
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
|
||||
expect(scrapingScheduler.getConfig().intervalMinutes).toBe(60);
|
||||
});
|
||||
|
||||
test('should not restart scheduler when updating config while stopped', () => {
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
|
||||
|
||||
scrapingScheduler.updateConfig({ intervalMinutes: 60 });
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('Statistics Management', () => {
|
||||
test('should reset statistics', () => {
|
||||
// Simulate some activity
|
||||
scrapingScheduler.start();
|
||||
const statsBeforeReset = scrapingScheduler.getStats();
|
||||
statsBeforeReset.totalRuns = 5;
|
||||
statsBeforeReset.successfulRuns = 3;
|
||||
statsBeforeReset.totalItemsScraped = 100;
|
||||
|
||||
scrapingScheduler.resetStats();
|
||||
const statsAfterReset = scrapingScheduler.getStats();
|
||||
|
||||
expect(statsAfterReset.totalRuns).toBe(0);
|
||||
expect(statsAfterReset.successfulRuns).toBe(0);
|
||||
expect(statsAfterReset.failedRuns).toBe(0);
|
||||
expect(statsAfterReset.totalItemsScraped).toBe(0);
|
||||
expect(statsAfterReset.totalDuplicates).toBe(0);
|
||||
expect(statsAfterReset.lastRun).toBeNull();
|
||||
});
|
||||
});
|
||||
|
||||
describe('Graceful Shutdown', () => {
|
||||
test('should shutdown gracefully when not running', async () => {
|
||||
await expect(scrapingScheduler.shutdown()).resolves.not.toThrow();
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
|
||||
});
|
||||
|
||||
test.skip('should shutdown gracefully when running', async () => {
|
||||
scrapingScheduler.start();
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(true);
|
||||
|
||||
await scrapingScheduler.shutdown();
|
||||
expect(scrapingScheduler.isSchedulerRunning()).toBe(false);
|
||||
}, 10000);
|
||||
});
|
||||
});
|
231
src/__tests__/ScrapingService.test.ts
Normal file
231
src/__tests__/ScrapingService.test.ts
Normal file
@ -0,0 +1,231 @@
|
||||
import { ScrapingService } from '../services/ScrapingService';
|
||||
import { IFeedRepository } from '../repositories/FeedRepository';
|
||||
|
||||
// Mock FeedRepository
|
||||
const mockFeedRepository: jest.Mocked<IFeedRepository> = {
|
||||
create: jest.fn(),
|
||||
findAll: jest.fn(),
|
||||
findById: jest.fn(),
|
||||
findByUrl: jest.fn(),
|
||||
findBySource: jest.fn(),
|
||||
findTodaysFrontPage: jest.fn(),
|
||||
update: jest.fn(),
|
||||
delete: jest.fn(),
|
||||
deleteMany: jest.fn(),
|
||||
count: jest.fn(),
|
||||
exists: jest.fn()
|
||||
};
|
||||
|
||||
describe('ScrapingService', () => {
|
||||
let scrapingService: ScrapingService;
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
scrapingService = new ScrapingService(mockFeedRepository);
|
||||
});
|
||||
|
||||
describe('Basic Functionality', () => {
|
||||
test('should create ScrapingService instance', () => {
|
||||
expect(scrapingService).toBeInstanceOf(ScrapingService);
|
||||
});
|
||||
|
||||
test('should return service name', () => {
|
||||
const serviceName = scrapingService.getServiceName();
|
||||
expect(serviceName).toBe('ScrapingService');
|
||||
});
|
||||
|
||||
test('should have access to repository', () => {
|
||||
const hasRepository = scrapingService.hasRepository();
|
||||
expect(hasRepository).toBe(true);
|
||||
});
|
||||
|
||||
test('should get feed count from repository', async () => {
|
||||
mockFeedRepository.count.mockResolvedValue(5);
|
||||
|
||||
const count = await scrapingService.getFeedCount();
|
||||
|
||||
expect(mockFeedRepository.count).toHaveBeenCalled();
|
||||
expect(count).toBe(5);
|
||||
});
|
||||
|
||||
test('should handle repository errors when getting feed count', async () => {
|
||||
const errorMessage = 'Database connection failed';
|
||||
mockFeedRepository.count.mockRejectedValue(new Error(errorMessage));
|
||||
|
||||
await expect(scrapingService.getFeedCount()).rejects.toThrow(errorMessage);
|
||||
expect(mockFeedRepository.count).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
test('should save feed item to repository', async () => {
|
||||
const feedData = {
|
||||
title: 'Test News',
|
||||
description: 'Test description',
|
||||
url: 'https://example.com/news',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
};
|
||||
|
||||
const savedFeed = { _id: '1', ...feedData };
|
||||
mockFeedRepository.create.mockResolvedValue(savedFeed);
|
||||
|
||||
const result = await scrapingService.saveFeedItem(feedData);
|
||||
|
||||
expect(mockFeedRepository.create).toHaveBeenCalledWith(feedData);
|
||||
expect(result).toEqual(savedFeed);
|
||||
});
|
||||
|
||||
test('should check if feed exists by URL', async () => {
|
||||
const testUrl = 'https://example.com/news';
|
||||
const existingFeed = {
|
||||
_id: '1',
|
||||
title: 'Existing News',
|
||||
description: 'Existing description',
|
||||
url: testUrl,
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
};
|
||||
|
||||
mockFeedRepository.findByUrl.mockResolvedValue(existingFeed);
|
||||
|
||||
const exists = await scrapingService.feedExists(testUrl);
|
||||
|
||||
expect(mockFeedRepository.findByUrl).toHaveBeenCalledWith(testUrl);
|
||||
expect(exists).toBe(true);
|
||||
});
|
||||
|
||||
test('should save feed item only if it does not exist', async () => {
|
||||
const feedData = {
|
||||
title: 'New News',
|
||||
description: 'New description',
|
||||
url: 'https://example.com/new-news',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
};
|
||||
|
||||
const savedFeed = { _id: '2', ...feedData };
|
||||
mockFeedRepository.findByUrl.mockResolvedValue(null);
|
||||
mockFeedRepository.create.mockResolvedValue(savedFeed);
|
||||
|
||||
const result = await scrapingService.saveIfNotExists(feedData);
|
||||
|
||||
expect(mockFeedRepository.findByUrl).toHaveBeenCalledWith(feedData.url);
|
||||
expect(mockFeedRepository.create).toHaveBeenCalledWith(feedData);
|
||||
expect(result).toEqual(savedFeed);
|
||||
});
|
||||
|
||||
test('should return null when trying to save existing feed', async () => {
|
||||
const feedData = {
|
||||
title: 'Existing News',
|
||||
description: 'Existing description',
|
||||
url: 'https://example.com/existing-news',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
};
|
||||
|
||||
const existingFeed = { _id: '1', ...feedData };
|
||||
mockFeedRepository.findByUrl.mockResolvedValue(existingFeed);
|
||||
|
||||
const result = await scrapingService.saveIfNotExists(feedData);
|
||||
|
||||
expect(mockFeedRepository.findByUrl).toHaveBeenCalledWith(feedData.url);
|
||||
expect(mockFeedRepository.create).not.toHaveBeenCalled();
|
||||
expect(result).toBeNull();
|
||||
});
|
||||
|
||||
test('should process multiple feed items and return results', async () => {
|
||||
const feedItems = [
|
||||
{
|
||||
title: 'News 1',
|
||||
description: 'Description 1',
|
||||
url: 'https://example.com/news1',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
},
|
||||
{
|
||||
title: 'News 2',
|
||||
description: 'Description 2',
|
||||
url: 'https://example.com/news2',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
}
|
||||
];
|
||||
|
||||
const savedFeeds = [
|
||||
{ _id: '1', ...feedItems[0] },
|
||||
{ _id: '2', ...feedItems[1] }
|
||||
];
|
||||
|
||||
mockFeedRepository.findByUrl.mockResolvedValue(null);
|
||||
mockFeedRepository.create.mockResolvedValueOnce(savedFeeds[0]).mockResolvedValueOnce(savedFeeds[1]);
|
||||
|
||||
const results = await scrapingService.processFeedBatch(feedItems);
|
||||
|
||||
expect(mockFeedRepository.findByUrl).toHaveBeenCalledTimes(2);
|
||||
expect(mockFeedRepository.create).toHaveBeenCalledTimes(2);
|
||||
expect(results).toHaveLength(2);
|
||||
expect(results[0]).toEqual(savedFeeds[0]);
|
||||
expect(results[1]).toEqual(savedFeeds[1]);
|
||||
});
|
||||
|
||||
test('should handle errors during batch processing', async () => {
|
||||
const feedItems = [
|
||||
{
|
||||
title: 'News 1',
|
||||
description: 'Description 1',
|
||||
url: 'https://example.com/news1',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
}
|
||||
];
|
||||
|
||||
mockFeedRepository.findByUrl.mockRejectedValue(new Error('Database connection failed'));
|
||||
|
||||
await expect(scrapingService.processFeedBatch(feedItems)).rejects.toThrow('Database connection failed');
|
||||
expect(mockFeedRepository.findByUrl).toHaveBeenCalledWith(feedItems[0].url);
|
||||
});
|
||||
|
||||
test('should handle mixed results in batch processing', async () => {
|
||||
const feedItems = [
|
||||
{
|
||||
title: 'New News',
|
||||
description: 'New description',
|
||||
url: 'https://example.com/new-news',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
},
|
||||
{
|
||||
title: 'Existing News',
|
||||
description: 'Existing description',
|
||||
url: 'https://example.com/existing-news',
|
||||
source: 'El País' as any,
|
||||
publishedAt: new Date(),
|
||||
isManual: false
|
||||
}
|
||||
];
|
||||
|
||||
const savedFeed = { _id: '1', ...feedItems[0] };
|
||||
const existingFeed = { _id: '2', ...feedItems[1] };
|
||||
|
||||
mockFeedRepository.findByUrl
|
||||
.mockResolvedValueOnce(null)
|
||||
.mockResolvedValueOnce(existingFeed);
|
||||
mockFeedRepository.create.mockResolvedValue(savedFeed);
|
||||
|
||||
const results = await scrapingService.processFeedBatch(feedItems);
|
||||
|
||||
expect(mockFeedRepository.findByUrl).toHaveBeenCalledTimes(2);
|
||||
expect(mockFeedRepository.create).toHaveBeenCalledTimes(1);
|
||||
expect(results).toHaveLength(2);
|
||||
expect(results[0]).toEqual(savedFeed);
|
||||
expect(results[1]).toBeNull();
|
||||
});
|
||||
});
|
||||
});
|
210
src/__tests__/WebScraper.test.ts
Normal file
210
src/__tests__/WebScraper.test.ts
Normal file
@ -0,0 +1,210 @@
|
||||
import { WebScraper } from '../utils/WebScraper';
|
||||
import { NewsSource } from '../types/Feed';
|
||||
import { Logger } from '../utils/logger';
|
||||
|
||||
// Mock the Logger
|
||||
jest.mock('../utils/logger', () => ({
|
||||
Logger: {
|
||||
error: jest.fn(),
|
||||
warn: jest.fn(),
|
||||
info: jest.fn(),
|
||||
debug: jest.fn()
|
||||
}
|
||||
}));
|
||||
|
||||
// Mock fetch
|
||||
global.fetch = jest.fn();
|
||||
|
||||
describe('WebScraper', () => {
|
||||
let webScraper: WebScraper;
|
||||
const mockFetch = fetch as jest.MockedFunction<typeof fetch>;
|
||||
|
||||
beforeEach(() => {
|
||||
webScraper = new WebScraper();
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
describe('scrapeUrl', () => {
|
||||
test('should successfully scrape a URL with complete metadata', async () => {
|
||||
const mockHtml = `
|
||||
<html>
|
||||
<head>
|
||||
<title>Test News Article</title>
|
||||
<meta property="og:title" content="Test News Article">
|
||||
<meta property="og:description" content="This is a test news article description">
|
||||
<meta property="article:published_time" content="2024-01-15T10:30:00Z">
|
||||
</head>
|
||||
<body>
|
||||
<h1>Test News Article</h1>
|
||||
<p>Article content here...</p>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockHtml)
|
||||
} as Response);
|
||||
|
||||
const result = await webScraper.scrapeUrl('https://example.com/news');
|
||||
|
||||
expect(result).toEqual({
|
||||
title: 'Test News Article',
|
||||
description: 'This is a test news article description',
|
||||
url: 'https://example.com/news',
|
||||
publishedAt: new Date('2024-01-15T10:30:00Z')
|
||||
});
|
||||
|
||||
expect(mockFetch).toHaveBeenCalledWith('https://example.com/news', {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
test('should handle HTTP errors gracefully', async () => {
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: false,
|
||||
status: 404,
|
||||
statusText: 'Not Found'
|
||||
} as Response);
|
||||
|
||||
const result = await webScraper.scrapeUrl('https://example.com/not-found');
|
||||
|
||||
expect(result).toBeNull();
|
||||
expect(Logger.error).toHaveBeenCalledWith(
|
||||
'Failed to fetch https://example.com/not-found: 404 Not Found'
|
||||
);
|
||||
});
|
||||
|
||||
test('should handle network errors gracefully', async () => {
|
||||
mockFetch.mockRejectedValue(new Error('Network error'));
|
||||
|
||||
const result = await webScraper.scrapeUrl('https://example.com/error');
|
||||
|
||||
expect(result).toBeNull();
|
||||
expect(Logger.error).toHaveBeenCalledWith(
|
||||
'Error scraping https://example.com/error:',
|
||||
expect.any(Error)
|
||||
);
|
||||
});
|
||||
|
||||
test('should return null when no title is found', async () => {
|
||||
const mockHtml = `
|
||||
<html>
|
||||
<head>
|
||||
<meta property="og:description" content="Description without title">
|
||||
</head>
|
||||
<body>
|
||||
<p>Content without title</p>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockHtml)
|
||||
} as Response);
|
||||
|
||||
const result = await webScraper.scrapeUrl('https://example.com/no-title');
|
||||
|
||||
expect(result).toBeNull();
|
||||
expect(Logger.warn).toHaveBeenCalledWith('No title found for https://example.com/no-title');
|
||||
});
|
||||
|
||||
test('should return null when no description is found', async () => {
|
||||
const mockHtml = `
|
||||
<html>
|
||||
<head>
|
||||
<title>Title Only</title>
|
||||
</head>
|
||||
<body>
|
||||
<p>Content without description meta</p>
|
||||
</body>
|
||||
</html>
|
||||
`;
|
||||
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockHtml)
|
||||
} as Response);
|
||||
|
||||
const result = await webScraper.scrapeUrl('https://example.com/no-description');
|
||||
|
||||
expect(result).toBeNull();
|
||||
expect(Logger.warn).toHaveBeenCalledWith('No description found for https://example.com/no-description');
|
||||
});
|
||||
|
||||
test('should use current date when no published date is found', async () => {
|
||||
const mockHtml = `
|
||||
<html>
|
||||
<head>
|
||||
<title>Test Article</title>
|
||||
<meta property="og:description" content="Test description">
|
||||
</head>
|
||||
</html>
|
||||
`;
|
||||
|
||||
mockFetch.mockResolvedValue({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(mockHtml)
|
||||
} as Response);
|
||||
|
||||
const beforeScrape = new Date();
|
||||
const result = await webScraper.scrapeUrl('https://example.com/no-date');
|
||||
const afterScrape = new Date();
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.publishedAt.getTime()).toBeGreaterThanOrEqual(beforeScrape.getTime());
|
||||
expect(result!.publishedAt.getTime()).toBeLessThanOrEqual(afterScrape.getTime());
|
||||
});
|
||||
});
|
||||
|
||||
describe('convertToFeedData', () => {
|
||||
test('should convert scraped data to feed format', () => {
|
||||
const scrapedData = {
|
||||
title: 'Test News',
|
||||
description: 'Test description',
|
||||
url: 'https://example.com/news',
|
||||
publishedAt: new Date('2024-01-15T10:00:00Z')
|
||||
};
|
||||
|
||||
const feedData = webScraper.convertToFeedData(scrapedData, NewsSource.EL_PAIS);
|
||||
|
||||
expect(feedData).toEqual({
|
||||
title: 'Test News',
|
||||
description: 'Test description',
|
||||
url: 'https://example.com/news',
|
||||
source: NewsSource.EL_PAIS,
|
||||
publishedAt: new Date('2024-01-15T10:00:00Z'),
|
||||
isManual: false
|
||||
});
|
||||
});
|
||||
|
||||
test('should handle HTML with special characters and entities', async () => {
|
||||
const htmlWithEntities = `
|
||||
<html>
|
||||
<head>
|
||||
<title>News & Updates - El País</title>
|
||||
<meta name="description" content="Breaking news "today" & analysis">
|
||||
</head>
|
||||
</html>
|
||||
`;
|
||||
|
||||
global.fetch = jest.fn().mockResolvedValue({
|
||||
ok: true,
|
||||
text: () => Promise.resolve(htmlWithEntities)
|
||||
});
|
||||
|
||||
const result = await webScraper.scrapeUrl('https://example.com/news');
|
||||
|
||||
expect(result).toEqual({
|
||||
title: 'News & Updates - El País',
|
||||
description: 'Breaking news "today" & analysis',
|
||||
url: 'https://example.com/news',
|
||||
publishedAt: expect.any(Date)
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
@ -2,6 +2,11 @@ export interface IConfig {
|
||||
port: number;
|
||||
mongodbUri: string;
|
||||
nodeEnv: string;
|
||||
apiVersion: string;
|
||||
rateLimitWindowMs: number;
|
||||
rateLimitMaxRequests: number;
|
||||
requestTimeoutMs: number;
|
||||
userAgent: string;
|
||||
}
|
||||
|
||||
class Config implements IConfig {
|
||||
@ -10,12 +15,21 @@ class Config implements IConfig {
|
||||
public readonly port: number;
|
||||
public readonly mongodbUri: string;
|
||||
public readonly nodeEnv: string;
|
||||
|
||||
public readonly apiVersion: string;
|
||||
public readonly rateLimitWindowMs: number;
|
||||
public readonly rateLimitMaxRequests: number;
|
||||
public readonly requestTimeoutMs: number;
|
||||
public readonly userAgent: string;
|
||||
|
||||
private constructor() {
|
||||
this.port = parseInt(process.env.PORT || '4000', 10);
|
||||
this.port = parseInt(process.env.PORT || '3000', 10);
|
||||
this.mongodbUri = process.env.MONGODB_URI || 'mongodb://localhost:27017/dailytrends';
|
||||
this.nodeEnv = process.env.NODE_ENV || 'development';
|
||||
this.apiVersion = process.env.API_VERSION || 'v1';
|
||||
this.rateLimitWindowMs = parseInt(process.env.RATE_LIMIT_WINDOW_MS || '900000', 10);
|
||||
this.rateLimitMaxRequests = parseInt(process.env.RATE_LIMIT_MAX_REQUESTS || '100', 10);
|
||||
this.requestTimeoutMs = parseInt(process.env.REQUEST_TIMEOUT_MS || '10000', 10);
|
||||
this.userAgent = process.env.USER_AGENT || 'DailyTrends-Bot/1.0';
|
||||
|
||||
this.validateConfig();
|
||||
}
|
||||
@ -31,6 +45,22 @@ class Config implements IConfig {
|
||||
if (!this.mongodbUri) {
|
||||
throw new Error('MONGODB_URI is required');
|
||||
}
|
||||
|
||||
if (this.port < 1 || this.port > 65535) {
|
||||
throw new Error('PORT must be between 1 and 65535');
|
||||
}
|
||||
|
||||
if (this.rateLimitWindowMs < 1000) {
|
||||
throw new Error('RATE_LIMIT_WINDOW_MS must be at least 1000ms');
|
||||
}
|
||||
|
||||
if (this.rateLimitMaxRequests < 1) {
|
||||
throw new Error('RATE_LIMIT_MAX_REQUESTS must be at least 1');
|
||||
}
|
||||
|
||||
if (this.requestTimeoutMs < 1000) {
|
||||
throw new Error('REQUEST_TIMEOUT_MS must be at least 1000ms');
|
||||
}
|
||||
}
|
||||
|
||||
public isDevelopment(): boolean {
|
||||
|
78
src/extractors/BaseNewspaperExtractor.ts
Normal file
78
src/extractors/BaseNewspaperExtractor.ts
Normal file
@ -0,0 +1,78 @@
|
||||
import { WebScraper } from '../utils/WebScraper';
|
||||
import { IFeed, NewsSource } from '../types/Feed';
|
||||
import { NewspaperConfig } from '../types/NewspaperTypes';
|
||||
import { Logger } from '../utils/logger';
|
||||
|
||||
/**
|
||||
* Clase abstracta base para extractores de periódicos
|
||||
*/
|
||||
export abstract class BaseNewspaperExtractor {
|
||||
protected webScraper: WebScraper;
|
||||
protected config: NewspaperConfig;
|
||||
|
||||
constructor(config: NewspaperConfig) {
|
||||
this.webScraper = new WebScraper();
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Método abstracto que debe implementar cada extractor específico
|
||||
*/
|
||||
abstract extractFrontPageUrls(): Promise<string[]>;
|
||||
|
||||
/**
|
||||
* Extrae noticias de las URLs de portada
|
||||
*/
|
||||
async extractNews(): Promise<Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[]> {
|
||||
try {
|
||||
Logger.info(`Extracting front page URLs for ${this.config.name}`);
|
||||
const urls = await this.extractFrontPageUrls();
|
||||
|
||||
if (urls.length === 0) {
|
||||
Logger.warn(`No URLs found for ${this.config.name}`);
|
||||
return [];
|
||||
}
|
||||
|
||||
Logger.info(`Found ${urls.length} articles for ${this.config.name}`);
|
||||
const newsItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];
|
||||
|
||||
for (const url of urls) {
|
||||
try {
|
||||
const scrapedData = await this.webScraper.scrapeUrl(url);
|
||||
if (scrapedData) {
|
||||
const feedItem = this.webScraper.convertToFeedData(scrapedData, this.config.source);
|
||||
newsItems.push(feedItem);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error scraping article ${url}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
return newsItems;
|
||||
} catch (error) {
|
||||
Logger.error(`Error extracting news for ${this.config.name}:`, error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifica si el extractor está habilitado
|
||||
*/
|
||||
isEnabled(): boolean {
|
||||
return this.config.enabled;
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtiene el nombre del periódico
|
||||
*/
|
||||
getName(): string {
|
||||
return this.config.name;
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtiene la fuente del periódico
|
||||
*/
|
||||
getSource(): NewsSource {
|
||||
return this.config.source;
|
||||
}
|
||||
}
|
78
src/extractors/ElMundoExtractor.ts
Normal file
78
src/extractors/ElMundoExtractor.ts
Normal file
@ -0,0 +1,78 @@
|
||||
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
|
||||
import { NewsSource } from '../types/Feed';
|
||||
import { Logger } from '../utils/logger';
|
||||
|
||||
/**
|
||||
* Extractor específico para El Mundo
|
||||
*/
|
||||
export class ElMundoExtractor extends BaseNewspaperExtractor {
|
||||
constructor() {
|
||||
super({
|
||||
name: 'El Mundo',
|
||||
source: NewsSource.EL_MUNDO,
|
||||
baseUrl: 'https://elmundo.es',
|
||||
frontPageUrl: 'https://elmundo.es',
|
||||
selectors: {
|
||||
articleLinks: '.ue-c-cover-content__link, .ue-c-cover-content__headline-link, h2 a, h3 a',
|
||||
titleSelector: 'h1, .ue-c-article__headline',
|
||||
descriptionSelector: '.ue-c-article__standfirst, .ue-c-cover-content__standfirst',
|
||||
dateSelector: '.ue-c-article__publishdate, time',
|
||||
imageSelector: '.ue-c-article__image img'
|
||||
},
|
||||
enabled: true
|
||||
});
|
||||
}
|
||||
|
||||
async extractFrontPageUrls(): Promise<string[]> {
|
||||
// Obtener HTML directamente usando fetch
|
||||
const response = await fetch(this.config.frontPageUrl, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
}
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
|
||||
return [];
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
if (!html) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
// Extraer enlaces de artículos usando regex
|
||||
const linkRegex = /<a[^>]+href=["']([^"']*(?:elmundo\.es)?[^"']*)["'][^>]*>.*?<\/a>/gi;
|
||||
const urls: string[] = [];
|
||||
let match;
|
||||
|
||||
while ((match = linkRegex.exec(html)) !== null) {
|
||||
let url = match[1];
|
||||
|
||||
// Filtrar solo URLs de artículos relevantes
|
||||
if (url.includes('/espana/') ||
|
||||
url.includes('/internacional/') ||
|
||||
url.includes('/economia/') ||
|
||||
url.includes('/sociedad/') ||
|
||||
url.includes('/politica/')) {
|
||||
|
||||
// Convertir URLs relativas a absolutas
|
||||
if (url.startsWith('/')) {
|
||||
url = this.config.baseUrl + url;
|
||||
}
|
||||
|
||||
if (!urls.includes(url) && urls.length < 20) {
|
||||
urls.push(url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return urls;
|
||||
} catch (error) {
|
||||
Logger.error(`Error extracting El Mundo URLs:`, error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
}
|
78
src/extractors/ElPaisExtractor.ts
Normal file
78
src/extractors/ElPaisExtractor.ts
Normal file
@ -0,0 +1,78 @@
|
||||
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
|
||||
import { NewsSource } from '../types/Feed';
|
||||
import { Logger } from '../utils/logger';
|
||||
|
||||
/**
|
||||
* Extractor específico para El País
|
||||
*/
|
||||
export class ElPaisExtractor extends BaseNewspaperExtractor {
|
||||
constructor() {
|
||||
super({
|
||||
name: 'El País',
|
||||
source: NewsSource.EL_PAIS,
|
||||
baseUrl: 'https://elpais.com',
|
||||
frontPageUrl: 'https://elpais.com',
|
||||
selectors: {
|
||||
articleLinks: 'article h2 a, .c_t a, .articulo-titulo a, h2.articulo-titulo a',
|
||||
titleSelector: 'h1, .articulo-titulo',
|
||||
descriptionSelector: '.articulo-entradilla, .entradilla, .subtitulo',
|
||||
dateSelector: '.articulo-fecha, time',
|
||||
imageSelector: '.articulo-foto img, .foto img'
|
||||
},
|
||||
enabled: true
|
||||
});
|
||||
}
|
||||
|
||||
async extractFrontPageUrls(): Promise<string[]> {
|
||||
// Obtener HTML directamente usando fetch
|
||||
const response = await fetch(this.config.frontPageUrl, {
|
||||
headers: {
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
}
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
Logger.error(`Failed to fetch ${this.config.frontPageUrl}: ${response.status}`);
|
||||
return [];
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
if (!html) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
// Extraer enlaces de artículos usando regex
|
||||
const linkRegex = /<a[^>]+href=["']([^"']*(?:elpais\.com)?[^"']*)["'][^>]*>.*?<\/a>/gi;
|
||||
const urls: string[] = [];
|
||||
let match;
|
||||
|
||||
while ((match = linkRegex.exec(html)) !== null) {
|
||||
let url = match[1];
|
||||
|
||||
// Filtrar solo URLs de artículos relevantes
|
||||
if (url.includes('/politica/') ||
|
||||
url.includes('/economia/') ||
|
||||
url.includes('/sociedad/') ||
|
||||
url.includes('/internacional/') ||
|
||||
url.includes('/espana/')) {
|
||||
|
||||
// Convertir URLs relativas a absolutas
|
||||
if (url.startsWith('/')) {
|
||||
url = this.config.baseUrl + url;
|
||||
}
|
||||
|
||||
if (!urls.includes(url) && urls.length < 20) {
|
||||
urls.push(url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return urls;
|
||||
} catch (error) {
|
||||
Logger.error(`Error extracting El País URLs:`, error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
}
|
37
src/extractors/NewspaperExtractorFactory.ts
Normal file
37
src/extractors/NewspaperExtractorFactory.ts
Normal file
@ -0,0 +1,37 @@
|
||||
import { BaseNewspaperExtractor } from './BaseNewspaperExtractor';
|
||||
import { ElPaisExtractor } from './ElPaisExtractor';
|
||||
import { ElMundoExtractor } from './ElMundoExtractor';
|
||||
import { NewsSource } from '../types/Feed';
|
||||
import { Logger } from '../utils/logger';
|
||||
|
||||
/**
|
||||
* Factory para crear extractores de periódicos
|
||||
*/
|
||||
export class NewspaperExtractorFactory {
|
||||
static createExtractor(source: NewsSource): BaseNewspaperExtractor | null {
|
||||
switch (source) {
|
||||
case NewsSource.EL_PAIS:
|
||||
return new ElPaisExtractor();
|
||||
case NewsSource.EL_MUNDO:
|
||||
return new ElMundoExtractor();
|
||||
default:
|
||||
Logger.warn(`No extractor available for source: ${source}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static getAllAvailableExtractors(): BaseNewspaperExtractor[] {
|
||||
const extractors: BaseNewspaperExtractor[] = [];
|
||||
|
||||
for (const source of Object.values(NewsSource)) {
|
||||
if (source !== NewsSource.MANUAL) {
|
||||
const extractor = this.createExtractor(source);
|
||||
if (extractor) {
|
||||
extractors.push(extractor);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return extractors;
|
||||
}
|
||||
}
|
61
src/scraper.ts
Normal file
61
src/scraper.ts
Normal file
@ -0,0 +1,61 @@
|
||||
import { ScrapingScheduler } from './services/ScrapingScheduler.js';
|
||||
import { FeedRepository } from './repositories/FeedRepository.js';
|
||||
import { DatabaseConnection } from './config/database.js';
|
||||
import { Logger } from './utils/logger.js';
|
||||
|
||||
let scheduler: ScrapingScheduler;
|
||||
|
||||
async function initializeScraper() {
|
||||
try {
|
||||
// Connect to database
|
||||
await DatabaseConnection.getInstance().connect();
|
||||
Logger.database.connected();
|
||||
|
||||
// Initialize repository and scheduler
|
||||
const feedRepository = new FeedRepository();
|
||||
scheduler = new ScrapingScheduler(feedRepository, {
|
||||
intervalMinutes: 30, // Run every 30 minutes
|
||||
maxRetries: 2,
|
||||
retryDelayMinutes: 5,
|
||||
enabled: true
|
||||
});
|
||||
|
||||
// Start the scheduler
|
||||
scheduler.start();
|
||||
Logger.info('Scraping scheduler started successfully');
|
||||
|
||||
// Log initial stats
|
||||
const stats = scheduler.getStats();
|
||||
Logger.info('Initial scheduler stats', stats);
|
||||
|
||||
} catch (error) {
|
||||
Logger.error('Failed to start scraper', { error });
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
const shutdown = async () => {
|
||||
try {
|
||||
if (scheduler) {
|
||||
await scheduler.shutdown();
|
||||
Logger.info('Scraping scheduler stopped');
|
||||
}
|
||||
|
||||
await DatabaseConnection.getInstance().disconnect();
|
||||
Logger.database.disconnected();
|
||||
process.exit(0);
|
||||
} catch (error) {
|
||||
Logger.error('Error during scraper shutdown', { error });
|
||||
process.exit(1);
|
||||
}
|
||||
};
|
||||
|
||||
// Handle graceful shutdown
|
||||
process.on('SIGINT', shutdown);
|
||||
process.on('SIGTERM', shutdown);
|
||||
|
||||
// Start the scraper
|
||||
initializeScraper().catch(error => {
|
||||
Logger.error('Failed to initialize scraper', { error });
|
||||
process.exit(1);
|
||||
});
|
156
src/services/ContentScrapingService.ts
Normal file
156
src/services/ContentScrapingService.ts
Normal file
@ -0,0 +1,156 @@
|
||||
import { WebScraper } from '../utils/WebScraper.js';
|
||||
import { ScrapingService } from './ScrapingService.js';
|
||||
import { IFeed, NewsSource } from '../types/Feed.js';
|
||||
import { IFeedRepository } from '../repositories/FeedRepository.js';
|
||||
import { Logger } from '../utils/logger.js';
|
||||
|
||||
interface ScrapingResult {
|
||||
success: number;
|
||||
failed: number;
|
||||
duplicates: number;
|
||||
items: (IFeed | null)[];
|
||||
}
|
||||
|
||||
interface NewsSourceConfig {
|
||||
name: string;
|
||||
source: NewsSource;
|
||||
webUrls?: string[];
|
||||
enabled: boolean;
|
||||
}
|
||||
|
||||
export class ContentScrapingService {
|
||||
private webScraper: WebScraper;
|
||||
private scrapingService: ScrapingService;
|
||||
|
||||
constructor(feedRepository: IFeedRepository) {
|
||||
this.webScraper = new WebScraper();
|
||||
this.scrapingService = new ScrapingService(feedRepository);
|
||||
}
|
||||
|
||||
|
||||
|
||||
async scrapeFromWebUrls(urls: string[], source: NewsSource): Promise<ScrapingResult> {
|
||||
Logger.info(`Starting web scraping from ${urls.length} URLs for ${source}`);
|
||||
|
||||
const feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[] = [];
|
||||
|
||||
for (const url of urls) {
|
||||
try {
|
||||
const scrapedData = await this.webScraper.scrapeUrl(url);
|
||||
if (scrapedData) {
|
||||
const feedData = this.webScraper.convertToFeedData(scrapedData, source);
|
||||
feedItems.push(feedData);
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(`Error scraping URL ${url}:`, error);
|
||||
}
|
||||
}
|
||||
|
||||
if (feedItems.length === 0) {
|
||||
Logger.warn(`No items scraped from web URLs`);
|
||||
return { success: 0, failed: urls.length, duplicates: 0, items: [] };
|
||||
}
|
||||
|
||||
const results = await this.scrapingService.processFeedBatch(feedItems);
|
||||
return this.analyzeResults(results);
|
||||
}
|
||||
|
||||
async scrapeFromSource(config: NewsSourceConfig): Promise<ScrapingResult> {
|
||||
if (!config.enabled) {
|
||||
Logger.info(`Skipping disabled source: ${config.name}`);
|
||||
return { success: 0, failed: 0, duplicates: 0, items: [] };
|
||||
}
|
||||
|
||||
Logger.info(`Starting content scraping for source: ${config.name}`);
|
||||
|
||||
let totalResult: ScrapingResult = { success: 0, failed: 0, duplicates: 0, items: [] };
|
||||
|
||||
// Scrape from web URLs if available
|
||||
if (config.webUrls && config.webUrls.length > 0) {
|
||||
const webResult = await this.scrapeFromWebUrls(config.webUrls, config.source);
|
||||
totalResult = this.mergeResults(totalResult, webResult);
|
||||
}
|
||||
|
||||
Logger.info(`Completed scraping for ${config.name}: ${totalResult.success} success, ${totalResult.failed} failed, ${totalResult.duplicates} duplicates`);
|
||||
return totalResult;
|
||||
}
|
||||
|
||||
async scrapeFromMultipleSources(configs: NewsSourceConfig[]): Promise<Map<string, ScrapingResult>> {
|
||||
Logger.info(`Starting batch scraping from ${configs.length} sources`);
|
||||
|
||||
const results = new Map<string, ScrapingResult>();
|
||||
|
||||
for (const config of configs) {
|
||||
try {
|
||||
const result = await this.scrapeFromSource(config);
|
||||
results.set(config.name, result);
|
||||
} catch (error) {
|
||||
Logger.error(`Error scraping source ${config.name}:`, error);
|
||||
results.set(config.name, { success: 0, failed: 1, duplicates: 0, items: [] });
|
||||
}
|
||||
}
|
||||
|
||||
const totalStats = this.calculateTotalStats(results);
|
||||
Logger.info(`Batch scraping completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
|
||||
const success = results.filter(item => item !== null).length;
|
||||
const duplicates = results.filter(item => item === null).length;
|
||||
|
||||
return {
|
||||
success,
|
||||
failed: 0, // processFeedBatch doesn't fail individual items, it throws on repository errors
|
||||
duplicates,
|
||||
items: results
|
||||
};
|
||||
}
|
||||
|
||||
private mergeResults(result1: ScrapingResult, result2: ScrapingResult): ScrapingResult {
|
||||
return {
|
||||
success: result1.success + result2.success,
|
||||
failed: result1.failed + result2.failed,
|
||||
duplicates: result1.duplicates + result2.duplicates,
|
||||
items: [...result1.items, ...result2.items]
|
||||
};
|
||||
}
|
||||
|
||||
private calculateTotalStats(results: Map<string, ScrapingResult>): ScrapingResult {
|
||||
let totalSuccess = 0;
|
||||
let totalFailed = 0;
|
||||
let totalDuplicates = 0;
|
||||
const allItems: (IFeed | null)[] = [];
|
||||
|
||||
for (const result of results.values()) {
|
||||
totalSuccess += result.success;
|
||||
totalFailed += result.failed;
|
||||
totalDuplicates += result.duplicates;
|
||||
allItems.push(...result.items);
|
||||
}
|
||||
|
||||
return {
|
||||
success: totalSuccess,
|
||||
failed: totalFailed,
|
||||
duplicates: totalDuplicates,
|
||||
items: allItems
|
||||
};
|
||||
}
|
||||
|
||||
// Utility method to create common news source configurations
|
||||
static createNewsSourceConfigs(): NewsSourceConfig[] {
|
||||
return [
|
||||
{
|
||||
name: 'El País',
|
||||
source: NewsSource.EL_PAIS,
|
||||
enabled: true
|
||||
},
|
||||
{
|
||||
name: 'El Mundo',
|
||||
source: NewsSource.EL_MUNDO,
|
||||
enabled: true
|
||||
}
|
||||
];
|
||||
}
|
||||
}
|
193
src/services/FeedReaderService.ts
Normal file
193
src/services/FeedReaderService.ts
Normal file
@ -0,0 +1,193 @@
|
||||
import { ScrapingService } from './ScrapingService';
|
||||
import { IFeed, NewsSource } from '../types/Feed';
|
||||
import { IFeedRepository } from '../repositories/FeedRepository';
|
||||
import { Logger } from '../utils/logger';
|
||||
import { BaseNewspaperExtractor } from '../extractors/BaseNewspaperExtractor';
|
||||
import { NewspaperExtractorFactory } from '../extractors/NewspaperExtractorFactory';
|
||||
import { ScrapingResult } from '../types/NewspaperTypes';
|
||||
|
||||
/**
|
||||
* Servicio principal de lectura de feeds mediante web scraping
|
||||
*/
|
||||
export class FeedReaderService {
|
||||
private scrapingService: ScrapingService;
|
||||
private extractors: Map<NewsSource, BaseNewspaperExtractor>;
|
||||
|
||||
constructor(feedRepository: IFeedRepository) {
|
||||
this.scrapingService = new ScrapingService(feedRepository);
|
||||
this.extractors = new Map();
|
||||
this.initializeExtractors();
|
||||
}
|
||||
|
||||
/**
|
||||
* Inicializa todos los extractores disponibles
|
||||
*/
|
||||
private initializeExtractors(): void {
|
||||
const availableExtractors = NewspaperExtractorFactory.getAllAvailableExtractors();
|
||||
|
||||
for (const extractor of availableExtractors) {
|
||||
this.extractors.set(extractor.getSource(), extractor);
|
||||
Logger.info(`Initialized extractor for ${extractor.getName()}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extrae noticias de un periódico específico
|
||||
*/
|
||||
async extractFromNewspaper(source: NewsSource): Promise<ScrapingResult> {
|
||||
const extractor = this.extractors.get(source);
|
||||
|
||||
if (!extractor) {
|
||||
const error = `No extractor found for source: ${source}`;
|
||||
Logger.error(error);
|
||||
return {
|
||||
success: 0,
|
||||
failed: 1,
|
||||
duplicates: 0,
|
||||
items: [],
|
||||
errors: [error]
|
||||
};
|
||||
}
|
||||
|
||||
if (!extractor.isEnabled()) {
|
||||
Logger.info(`Skipping disabled extractor: ${extractor.getName()}`);
|
||||
return {
|
||||
success: 0,
|
||||
failed: 0,
|
||||
duplicates: 0,
|
||||
items: [],
|
||||
errors: []
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
Logger.info(`Starting extraction for ${extractor.getName()}`);
|
||||
const newsItems = await extractor.extractNews();
|
||||
|
||||
if (newsItems.length === 0) {
|
||||
Logger.warn(`No news items extracted for ${extractor.getName()}`);
|
||||
return {
|
||||
success: 0,
|
||||
failed: 0,
|
||||
duplicates: 0,
|
||||
items: [],
|
||||
errors: []
|
||||
};
|
||||
}
|
||||
|
||||
const results = await this.scrapingService.processFeedBatch(newsItems);
|
||||
const analyzed = this.analyzeResults(results);
|
||||
|
||||
Logger.info(`Completed extraction for ${extractor.getName()}: ${analyzed.success} success, ${analyzed.failed} failed, ${analyzed.duplicates} duplicates`);
|
||||
return analyzed;
|
||||
} catch (error) {
|
||||
const errorMsg = `Error extracting from ${extractor.getName()}: ${error}`;
|
||||
Logger.error(errorMsg);
|
||||
return {
|
||||
success: 0,
|
||||
failed: 1,
|
||||
duplicates: 0,
|
||||
items: [],
|
||||
errors: [errorMsg]
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extrae noticias de todos los periódicos disponibles
|
||||
*/
|
||||
async extractFromAllNewspapers(): Promise<Map<NewsSource, ScrapingResult>> {
|
||||
Logger.info(`Starting batch extraction from ${this.extractors.size} newspapers`);
|
||||
const results = new Map<NewsSource, ScrapingResult>();
|
||||
|
||||
for (const [source, extractor] of this.extractors) {
|
||||
if (extractor.isEnabled()) {
|
||||
const result = await this.extractFromNewspaper(source);
|
||||
results.set(source, result);
|
||||
} else {
|
||||
Logger.info(`Skipping disabled newspaper: ${extractor.getName()}`);
|
||||
}
|
||||
}
|
||||
|
||||
const totalStats = this.calculateTotalStats(results);
|
||||
Logger.info(`Batch extraction completed: ${totalStats.success} total success, ${totalStats.failed} total failed, ${totalStats.duplicates} total duplicates`);
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtiene la lista de periódicos disponibles
|
||||
*/
|
||||
getAvailableNewspapers(): { source: NewsSource; name: string; enabled: boolean }[] {
|
||||
const newspapers: { source: NewsSource; name: string; enabled: boolean }[] = [];
|
||||
|
||||
for (const [source, extractor] of this.extractors) {
|
||||
newspapers.push({
|
||||
source,
|
||||
name: extractor.getName(),
|
||||
enabled: extractor.isEnabled()
|
||||
});
|
||||
}
|
||||
|
||||
return newspapers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Habilita o deshabilita un extractor específico
|
||||
*/
|
||||
setExtractorEnabled(source: NewsSource, enabled: boolean): boolean {
|
||||
const extractor = this.extractors.get(source);
|
||||
if (!extractor) {
|
||||
Logger.error(`Cannot set enabled state: No extractor found for source ${source}`);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Nota: En una implementación real, esto podría modificar la configuración
|
||||
// Por ahora, solo registramos el cambio
|
||||
Logger.info(`${enabled ? 'Enabled' : 'Disabled'} extractor for ${extractor.getName()}`);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Analiza los resultados del procesamiento
|
||||
*/
|
||||
private analyzeResults(results: (IFeed | null)[]): ScrapingResult {
|
||||
const success = results.filter(item => item !== null).length;
|
||||
const failed = results.filter(item => item === null).length;
|
||||
|
||||
return {
|
||||
success,
|
||||
failed,
|
||||
duplicates: 0, // El ScrapingService maneja duplicados internamente
|
||||
items: results,
|
||||
errors: []
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Calcula estadísticas totales de múltiples resultados
|
||||
*/
|
||||
private calculateTotalStats(results: Map<NewsSource, ScrapingResult>): ScrapingResult {
|
||||
let totalSuccess = 0;
|
||||
let totalFailed = 0;
|
||||
let totalDuplicates = 0;
|
||||
const allItems: (IFeed | null)[] = [];
|
||||
const allErrors: string[] = [];
|
||||
|
||||
for (const result of results.values()) {
|
||||
totalSuccess += result.success;
|
||||
totalFailed += result.failed;
|
||||
totalDuplicates += result.duplicates;
|
||||
allItems.push(...result.items);
|
||||
allErrors.push(...result.errors);
|
||||
}
|
||||
|
||||
return {
|
||||
success: totalSuccess,
|
||||
failed: totalFailed,
|
||||
duplicates: totalDuplicates,
|
||||
items: allItems,
|
||||
errors: allErrors
|
||||
};
|
||||
}
|
||||
}
|
225
src/services/ScrapingScheduler.ts
Normal file
225
src/services/ScrapingScheduler.ts
Normal file
@ -0,0 +1,225 @@
|
||||
import { ContentScrapingService } from './ContentScrapingService.js';
|
||||
import { IFeedRepository } from '../repositories/FeedRepository.js';
|
||||
import { Logger } from '../utils/logger.js';
|
||||
|
||||
interface ScheduleConfig {
|
||||
intervalMinutes: number;
|
||||
maxRetries: number;
|
||||
retryDelayMinutes: number;
|
||||
enabled: boolean;
|
||||
}
|
||||
|
||||
interface ScrapingStats {
|
||||
lastRun: Date | null;
|
||||
nextRun: Date | null;
|
||||
totalRuns: number;
|
||||
successfulRuns: number;
|
||||
failedRuns: number;
|
||||
totalItemsScraped: number;
|
||||
totalDuplicates: number;
|
||||
}
|
||||
|
||||
export class ScrapingScheduler {
|
||||
private contentScrapingService: ContentScrapingService;
|
||||
private scheduleConfig: ScheduleConfig;
|
||||
private stats: ScrapingStats;
|
||||
private intervalId: NodeJS.Timeout | null = null;
|
||||
private isRunning = false;
|
||||
|
||||
constructor(
|
||||
feedRepository: IFeedRepository,
|
||||
scheduleConfig: Partial<ScheduleConfig> = {}
|
||||
) {
|
||||
this.contentScrapingService = new ContentScrapingService(feedRepository);
|
||||
this.scheduleConfig = {
|
||||
intervalMinutes: 30, // Default: every 30 minutes
|
||||
maxRetries: 3,
|
||||
retryDelayMinutes: 5,
|
||||
enabled: true,
|
||||
...scheduleConfig
|
||||
};
|
||||
this.stats = {
|
||||
lastRun: null,
|
||||
nextRun: null,
|
||||
totalRuns: 0,
|
||||
successfulRuns: 0,
|
||||
failedRuns: 0,
|
||||
totalItemsScraped: 0,
|
||||
totalDuplicates: 0
|
||||
};
|
||||
}
|
||||
|
||||
start(): void {
|
||||
if (this.intervalId || !this.scheduleConfig.enabled) {
|
||||
Logger.warn('Scraping scheduler is already running or disabled');
|
||||
return;
|
||||
}
|
||||
|
||||
Logger.info(`Starting scraping scheduler with ${this.scheduleConfig.intervalMinutes} minute intervals`);
|
||||
|
||||
// Run immediately on start
|
||||
this.runScrapingCycle();
|
||||
|
||||
// Schedule recurring runs
|
||||
this.intervalId = setInterval(() => {
|
||||
this.runScrapingCycle();
|
||||
}, this.scheduleConfig.intervalMinutes * 60 * 1000);
|
||||
|
||||
this.updateNextRunTime();
|
||||
}
|
||||
|
||||
stop(): void {
|
||||
if (this.intervalId) {
|
||||
clearInterval(this.intervalId);
|
||||
this.intervalId = null;
|
||||
this.stats.nextRun = null;
|
||||
Logger.info('Scraping scheduler stopped');
|
||||
}
|
||||
}
|
||||
|
||||
async runScrapingCycle(): Promise<void> {
|
||||
if (this.isRunning) {
|
||||
Logger.warn('Scraping cycle already in progress, skipping this run');
|
||||
return;
|
||||
}
|
||||
|
||||
this.isRunning = true;
|
||||
this.stats.totalRuns++;
|
||||
this.stats.lastRun = new Date();
|
||||
|
||||
Logger.info(`Starting scraping cycle #${this.stats.totalRuns}`);
|
||||
|
||||
let retryCount = 0;
|
||||
let success = false;
|
||||
|
||||
while (retryCount <= this.scheduleConfig.maxRetries && !success) {
|
||||
try {
|
||||
const configs = ContentScrapingService.createNewsSourceConfigs();
|
||||
const results = await this.contentScrapingService.scrapeFromMultipleSources(configs);
|
||||
|
||||
// Update statistics
|
||||
let totalSuccess = 0;
|
||||
let totalDuplicates = 0;
|
||||
|
||||
for (const [sourceName, result] of results) {
|
||||
totalSuccess += result.success;
|
||||
totalDuplicates += result.duplicates;
|
||||
Logger.info(`${sourceName}: ${result.success} new, ${result.duplicates} duplicates, ${result.failed} failed`);
|
||||
}
|
||||
|
||||
this.stats.totalItemsScraped += totalSuccess;
|
||||
this.stats.totalDuplicates += totalDuplicates;
|
||||
this.stats.successfulRuns++;
|
||||
|
||||
Logger.info(`Scraping cycle completed successfully: ${totalSuccess} new items, ${totalDuplicates} duplicates`);
|
||||
success = true;
|
||||
|
||||
} catch (error) {
|
||||
retryCount++;
|
||||
Logger.error(`Scraping cycle failed (attempt ${retryCount}/${this.scheduleConfig.maxRetries + 1}):`, error);
|
||||
|
||||
if (retryCount <= this.scheduleConfig.maxRetries) {
|
||||
Logger.info(`Retrying in ${this.scheduleConfig.retryDelayMinutes} minutes...`);
|
||||
await this.delay(this.scheduleConfig.retryDelayMinutes * 60 * 1000);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!success) {
|
||||
this.stats.failedRuns++;
|
||||
Logger.error(`Scraping cycle failed after ${this.scheduleConfig.maxRetries + 1} attempts`);
|
||||
}
|
||||
|
||||
this.isRunning = false;
|
||||
this.updateNextRunTime();
|
||||
}
|
||||
|
||||
async runSingleSource(sourceName: string): Promise<void> {
|
||||
Logger.info(`Running single source scraping for: ${sourceName}`);
|
||||
|
||||
try {
|
||||
const configs = ContentScrapingService.createNewsSourceConfigs();
|
||||
const config = configs.find(c => c.name === sourceName);
|
||||
|
||||
if (!config) {
|
||||
throw new Error(`Source configuration not found: ${sourceName}`);
|
||||
}
|
||||
|
||||
const result = await this.contentScrapingService.scrapeFromSource(config);
|
||||
Logger.info(`Single source scraping completed for ${sourceName}: ${result.success} new, ${result.duplicates} duplicates, ${result.failed} failed`);
|
||||
|
||||
} catch (error) {
|
||||
Logger.error(`Single source scraping failed for ${sourceName}:`, error);
|
||||
throw error;
|
||||
}
|
||||
}
|
||||
|
||||
getStats(): ScrapingStats {
|
||||
return { ...this.stats };
|
||||
}
|
||||
|
||||
getConfig(): ScheduleConfig {
|
||||
return { ...this.scheduleConfig };
|
||||
}
|
||||
|
||||
updateConfig(newConfig: Partial<ScheduleConfig>): void {
|
||||
const wasRunning = this.intervalId !== null;
|
||||
|
||||
if (wasRunning) {
|
||||
this.stop();
|
||||
}
|
||||
|
||||
this.scheduleConfig = { ...this.scheduleConfig, ...newConfig };
|
||||
Logger.info('Scraping scheduler configuration updated', this.scheduleConfig);
|
||||
|
||||
if (wasRunning && this.scheduleConfig.enabled) {
|
||||
this.start();
|
||||
}
|
||||
}
|
||||
|
||||
isSchedulerRunning(): boolean {
|
||||
return this.intervalId !== null;
|
||||
}
|
||||
|
||||
isCycleRunning(): boolean {
|
||||
return this.isRunning;
|
||||
}
|
||||
|
||||
resetStats(): void {
|
||||
this.stats = {
|
||||
lastRun: null,
|
||||
nextRun: this.stats.nextRun,
|
||||
totalRuns: 0,
|
||||
successfulRuns: 0,
|
||||
failedRuns: 0,
|
||||
totalItemsScraped: 0,
|
||||
totalDuplicates: 0
|
||||
};
|
||||
Logger.info('Scraping scheduler statistics reset');
|
||||
}
|
||||
|
||||
private updateNextRunTime(): void {
|
||||
if (this.intervalId) {
|
||||
this.stats.nextRun = new Date(Date.now() + this.scheduleConfig.intervalMinutes * 60 * 1000);
|
||||
}
|
||||
}
|
||||
|
||||
private delay(ms: number): Promise<void> {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
// Graceful shutdown
|
||||
async shutdown(): Promise<void> {
|
||||
Logger.info('Shutting down scraping scheduler...');
|
||||
|
||||
this.stop();
|
||||
|
||||
// Wait for current cycle to complete if running
|
||||
while (this.isRunning) {
|
||||
Logger.info('Waiting for current scraping cycle to complete...');
|
||||
await this.delay(1000);
|
||||
}
|
||||
|
||||
Logger.info('Scraping scheduler shutdown complete');
|
||||
}
|
||||
}
|
44
src/services/ScrapingService.ts
Normal file
44
src/services/ScrapingService.ts
Normal file
@ -0,0 +1,44 @@
|
||||
import { IFeedRepository } from '../repositories/FeedRepository.js';
|
||||
import { IFeed } from '../types/Feed.js';
|
||||
|
||||
export class ScrapingService {
|
||||
constructor(private feedRepository: IFeedRepository) {}
|
||||
|
||||
getServiceName(): string {
|
||||
return 'ScrapingService';
|
||||
}
|
||||
|
||||
hasRepository(): boolean {
|
||||
return this.feedRepository !== null && this.feedRepository !== undefined;
|
||||
}
|
||||
|
||||
async getFeedCount(): Promise<number> {
|
||||
return await this.feedRepository.count();
|
||||
}
|
||||
|
||||
async saveFeedItem(feedData: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>): Promise<IFeed> {
|
||||
return await this.feedRepository.create(feedData);
|
||||
}
|
||||
|
||||
async feedExists(url: string): Promise<boolean> {
|
||||
const existingFeed = await this.feedRepository.findByUrl(url);
|
||||
return existingFeed !== null;
|
||||
}
|
||||
|
||||
async saveIfNotExists(feedData: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>): Promise<IFeed | null> {
|
||||
const exists = await this.feedExists(feedData.url);
|
||||
if (exists) {
|
||||
return null;
|
||||
}
|
||||
return await this.saveFeedItem(feedData);
|
||||
}
|
||||
|
||||
async processFeedBatch(feedItems: Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'>[]): Promise<(IFeed | null)[]> {
|
||||
const results: (IFeed | null)[] = [];
|
||||
for (const feedItem of feedItems) {
|
||||
const result = await this.saveIfNotExists(feedItem);
|
||||
results.push(result);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
}
|
36
src/types/NewspaperTypes.ts
Normal file
36
src/types/NewspaperTypes.ts
Normal file
@ -0,0 +1,36 @@
|
||||
import { NewsSource } from './Feed.js';
|
||||
import { IFeed } from './Feed.js';
|
||||
|
||||
/**
|
||||
* Interfaz para definir la configuración de extracción de un periódico
|
||||
*/
|
||||
export interface NewspaperConfig {
|
||||
name: string;
|
||||
source: NewsSource;
|
||||
baseUrl: string;
|
||||
frontPageUrl: string;
|
||||
selectors: NewsSelectors;
|
||||
enabled: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Selectores CSS para extraer elementos específicos de cada periódico
|
||||
*/
|
||||
export interface NewsSelectors {
|
||||
articleLinks: string;
|
||||
titleSelector?: string;
|
||||
descriptionSelector?: string;
|
||||
dateSelector?: string;
|
||||
imageSelector?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resultado del proceso de scraping
|
||||
*/
|
||||
export interface ScrapingResult {
|
||||
success: number;
|
||||
failed: number;
|
||||
duplicates: number;
|
||||
items: (IFeed | null)[];
|
||||
errors: string[];
|
||||
}
|
143
src/utils/WebScraper.ts
Normal file
143
src/utils/WebScraper.ts
Normal file
@ -0,0 +1,143 @@
|
||||
import { IFeed, NewsSource } from '../types/Feed.js';
|
||||
import { Logger } from './logger.js';
|
||||
|
||||
interface ScrapedData {
|
||||
title: string;
|
||||
description: string;
|
||||
url: string;
|
||||
publishedAt: Date;
|
||||
}
|
||||
|
||||
export class WebScraper {
|
||||
private userAgent = 'Mozilla/5.0 (compatible; DailyTrends/1.0)';
|
||||
|
||||
async scrapeUrl(url: string): Promise<ScrapedData | null> {
|
||||
try {
|
||||
const response = await fetch(url, {
|
||||
headers: {
|
||||
'User-Agent': this.userAgent,
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
}
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
Logger.error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const html = await response.text();
|
||||
return this.parseHtml(html, url);
|
||||
} catch (error) {
|
||||
Logger.error(`Error scraping ${url}:`, error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private parseHtml(html: string, url: string): ScrapedData | null {
|
||||
try {
|
||||
// Extract title from <title> tag or Open Graph
|
||||
const title = this.extractTitle(html);
|
||||
if (!title) {
|
||||
Logger.warn(`No title found for ${url}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Extract description from meta tags
|
||||
const description = this.extractDescription(html);
|
||||
if (!description) {
|
||||
Logger.warn(`No description found for ${url}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Extract published date
|
||||
const publishedAt = this.extractPublishedDate(html);
|
||||
|
||||
return {
|
||||
title: title.trim(),
|
||||
description: description.trim(),
|
||||
url,
|
||||
publishedAt
|
||||
};
|
||||
} catch (error) {
|
||||
Logger.error(`Error parsing HTML for ${url}:`, error);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private extractTitle(html: string): string | null {
|
||||
// Try Open Graph title first
|
||||
const ogTitleMatch = html.match(/<meta\s+property=["']og:title["']\s+content=["']([^"']+)["']/i);
|
||||
if (ogTitleMatch) {
|
||||
return ogTitleMatch[1];
|
||||
}
|
||||
|
||||
// Try Twitter title
|
||||
const twitterTitleMatch = html.match(/<meta\s+name=["']twitter:title["']\s+content=["']([^"']+)["']/i);
|
||||
if (twitterTitleMatch) {
|
||||
return twitterTitleMatch[1];
|
||||
}
|
||||
|
||||
// Fall back to <title> tag
|
||||
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
|
||||
if (titleMatch) {
|
||||
return titleMatch[1];
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private extractDescription(html: string): string | null {
|
||||
// Try Open Graph description first
|
||||
const ogDescMatch = html.match(/<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i);
|
||||
if (ogDescMatch) {
|
||||
return ogDescMatch[1];
|
||||
}
|
||||
|
||||
// Try Twitter description
|
||||
const twitterDescMatch = html.match(/<meta\s+name=["']twitter:description["']\s+content=["']([^"']+)["']/i);
|
||||
if (twitterDescMatch) {
|
||||
return twitterDescMatch[1];
|
||||
}
|
||||
|
||||
// Try meta description
|
||||
const metaDescMatch = html.match(/<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i);
|
||||
if (metaDescMatch) {
|
||||
return metaDescMatch[1];
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private extractPublishedDate(html: string): Date {
|
||||
// Try various date formats
|
||||
const datePatterns = [
|
||||
/<meta\s+property=["']article:published_time["']\s+content=["']([^"']+)["']/i,
|
||||
/<meta\s+name=["']pubdate["']\s+content=["']([^"']+)["']/i,
|
||||
/<time[^>]+datetime=["']([^"']+)["']/i
|
||||
];
|
||||
|
||||
for (const pattern of datePatterns) {
|
||||
const match = html.match(pattern);
|
||||
if (match) {
|
||||
const date = new Date(match[1]);
|
||||
if (!isNaN(date.getTime())) {
|
||||
return date;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Default to current date if no published date found
|
||||
return new Date();
|
||||
}
|
||||
|
||||
convertToFeedData(scrapedData: ScrapedData, source: NewsSource): Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'> {
|
||||
return {
|
||||
title: scrapedData.title,
|
||||
description: scrapedData.description,
|
||||
url: scrapedData.url,
|
||||
source,
|
||||
publishedAt: scrapedData.publishedAt,
|
||||
isManual: false
|
||||
};
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user